Expand Code
# Load Libraries
start_time_all <- Sys.time()
options(warn = 1) # output warnings as they appear for traceability in stdout/stderr record
knitr::opts_chunk$set(echo = TRUE, warning = FALSE) # warnings will go to console
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(batchreporter)
quiet_library(Matrix) # dependency of H5weaver
quiet_library(rhdf5) # dependency of H5weaver
quiet_library(H5weaver) # aifi package
quiet_library(HTOparser) # aifi package
quiet_library(ggplot2)
quiet_library(dplyr) # data wrangling
quiet_library(cowplot) # arranging multiple plots
quiet_library(gt) # formatted table output
quiet_library(plotly) # interactive plots
quiet_library(tidyr) # data wrangling
quiet_library(jsonlite) # reading and writing json metadata files
quiet_library(purrr)
quiet_library(future) # multi-threading for batch umap creation
quiet_library(future.apply) # multi-threading for batch umap creation
quiet_library(Seurat)
stm("Starting NGS Batch Report")
stm(paste(c("\t",paste(names(Sys.info()),Sys.info(),sep = ": ")), collapse = "\n\t"))
Argument Parsing
# give input directory rna-specific name
if(is.null(params$in_dir)) {
batch_id <- "X070"
in_dir <- system.file("extdata/X070", package = "batchreporter")
in_key <- system.file("extdata/example_sample_key_X070.csv", package = "batchreporter")
in_config <- system.file("extdata/default_rna_config_v1.csv", package = "batchreporter")
in_batch_meta <- system.file("extdata/batch-metadata.json", package = "batchreporter")
in_method_string <- "scrna;scatac;adt;hto"
out_dir <- tempdir()
} else {
batch_id <- params$batch_id
in_method_string <- params$in_method
in_dir <- params$in_dir
in_key <- params$in_key
in_config <- params$in_config
if(is.null(in_config)){
in_config <- system.file("extdata/default_rna_config_v1.csv", package = "batchreporter")
stm("No config file provided. Using default_rna_config_v1.csv for pbmc data")
}
out_dir <- params$out_dir
in_batch_meta <- params$in_batch_meta
if(is.null(in_batch_meta)){in_batch_meta <- NA}
}
stm(paste0("IN Batch : ", batch_id))
stm(paste0("IN Method : ", in_method_string))
stm(paste0("IN Directory : ", in_dir))
stm(paste0("IN Sample Key : ", in_key))
stm(paste0("IN Config : ", in_config))
stm(paste0("IN Batch Processing Info : ", in_batch_meta))
stm(paste0("OUT Dir : ", out_dir))
print(paste0("IN Batch : ", batch_id))
## [1] "IN Batch : X070"
print(paste0("IN Method : ", in_method_string))
## [1] "IN Method : scrna;scatac;adt;hto"
print(paste0("IN Directory : ", in_dir))
## [1] "IN Directory : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070"
print(paste0("IN Sample Key : ", in_key))
## [1] "IN Sample Key : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/example_sample_key_X070.csv"
print(paste0("IN Config : ", in_config))
## [1] "IN Config : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/default_rna_config_v1.csv"
print(paste0("IN Batch Processing Info : ", in_batch_meta))
## [1] "IN Batch Processing Info : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/batch-metadata.json"
print(paste0("OUT Dir : ", out_dir))
## [1] "OUT Dir : /var/folders/zc/cdsbwh5d60d_2h9c9brv90s80000gp/T//Rtmpo06rqu"
Check input files
if(!dir.exists(in_dir)) {
stm(paste("ERROR: Cannot find IN results dir:", in_dir))
stop()
}
if(!file.exists(in_key)) {
stm(paste("ERROR: Cannot find IN sample key:", in_key))
stop()
}
if(!file.exists(in_config)) {
stm(paste("ERROR: Cannot find IN config file:", in_config))
stop()
}
if(!is.na(in_batch_meta) && !file.exists(in_batch_meta)) {
stm(paste("ERROR: Cannot find IN batch meta:", in_batch_meta))
stop()
}
if(!dir.exists(out_dir)) {
stm(paste("Creating output directory:", out_dir))
dir.create(out_dir)
}
out_prefix <- file.path(out_dir, paste0(batch_id, "_"))
Read in the sample key
stm("Reading in sample key")
df_key <- data.table::fread(in_key)
has_controls <- any(grepl("Control", df_key$Type)) # used to control evaluation of batch control-related code chunks
assertthat::assert_that(length(unique(df_key$BatchID)) == 1, msg = "More than 1 batch in input sample key file")
## [1] TRUE
assertthat::assert_that(batch_id == unique(df_key$BatchID), msg = "Batch in input key file does not match input batch value")
## [1] TRUE
Determine which modalities streams were run
defined_modalities <- c("scrna", "scatac", "adt", "hto")
# convert method string to vector
in_method <- strsplit(in_method_string, split = ";")[[1]]
in_method <- tolower(in_method)
# Logic check input methods
if(!all(in_method %in% defined_modalities)){
unknowns <- setdiff(in_method, defined_modalities)
stop(sprintf("One or more input methods are not in defined modalities: '%s'. Defined modalities are: [%s]. Input methods should be passed as a ';'-delimited string, ie 'scrna;scatac;hto'.",
paste(unknowns, collapse = "', '"),
paste(defined_modalities, collapse = ', ')))
}
has_rna <- "scrna" %in% in_method
has_atac <- "scatac" %in% in_method
has_adt <- "adt" %in% in_method
has_hto <- "hto" %in% in_method
Define and check input folder expectations
if(has_rna){
in_rna <- file.path(in_dir, "scrna")
if(!dir.exists(in_rna)){
stop(sprintf("Expected RNA input directory [%s] does not exist.", in_rna))
}
}
if(has_atac){
in_atac <- file.path(in_dir, "scatac")
if(!dir.exists(in_atac)){
stop(sprintf("Expected ATAC input directory [%s] does not exist.", in_atac))
}
}
if(has_adt){
in_adt <- file.path(in_dir, "adt")
if(!dir.exists(in_adt)){
stop(sprintf("Expected ADT input directory [%s] does not exist.", in_adt))
}
}
if(has_hto){
in_hto <- file.path(in_dir, "hto")
if(!dir.exists(in_hto)){
stop(sprintf("Expected HTO input directory [%s] does not exist.", in_hto))
}
}
stm("Constructing Batch Information table")
# Summarize batch information, also declare some global batch variables that are used throughout the report
pools <- unique(df_key$PoolID)
n_pools <- length(pools)
if(n_pools==1 & grepl("P0$", pools)){
n_pools_label <- 0
} else {
n_pools_label <- n_pools
}
wells <- unlist(strsplit(unique(df_key$WellID), split = ";"))
n_wells <- length(wells)
samples <- unique(df_key$SampleID)
n_samples <- length(samples)
controls <- unique(df_key$SampleID[df_key$Type == "Control"])
control_string <- ifelse(has_controls, paste(controls, collapse = ", "), "None")
if (has_controls){
n_study_samples <- setdiff(samples, controls)
} else {
n_study_samples <- samples
}
n_study_samples <- length(n_study_samples)
samples_pool <- tapply(df_key$SampleID, df_key$PoolID, unique)
samples_pool_string <- sapply(samples_pool, function(x){paste(x, collapse = ", ")})
labels <- c("Batch","Libraries","N Samples", "N Pools","N Wells","Batch Control", paste0(pools, " Samples"))
values <- c(batch_id, in_method_string, n_study_samples, n_pools_label, n_wells, control_string, samples_pool_string)
simple_html_table(labels, values, fontsize = 3, col_widths_px = c(175, 850))
| Batch | X070 |
| Libraries | scrna;scatac;adt;hto |
| N Samples | 6 |
| N Pools | 1 |
| N Wells | 3 |
| Batch Control | None |
| X070-P1 Samples | PB1051W10, PB1178W10, PB1194W10, PB2216W10, PB5206W10, PB7626W10 |
if(!is.na(in_batch_meta)){
if(!file.exists(in_batch_meta)) {stop(sprintf("Supplied in_batch_meta file '%s' does not exist. ",in_batch_meta))}
meta_table <- jsonlite::read_json(in_batch_meta)
meta_table <- as.data.frame(meta_table) # collapses any nested names
if(nrow(meta_table) == 1){
simple_html_table(colnames(meta_table), unlist(meta_table[1,]), fontsize = 3, col_widths_px = c(200,250))
} else{
stop(sprintf("in_batch_meta file should have 1:1 key value pairs. Check supplied file '%s' for proper formatting. ",in_batch_meta))
}
} else {
cat("No processing details supplied")
}
| cellranger_version | CellRanger 4.0 |
| mapping_reference | hg38 |
| labeling_method | Seurat v.3.0 |
The following data summarizes parsing of hash tag oligos (HTO’s) per well and filtering for singlet cells based on cross-hash doublet/multiplet identification.
# output warnings
if(length(hto_warning_list) == 0){
hto_warning_list <- "None"
}
simple_html_table(labels = "File Warnings", values = paste(hto_warning_list, collapse = "<br>"), col_widths_px = c(120, 600))
| File Warnings | None |
Expand Code
# Config
config_list<- batchreporter::load_config(in_config)
hash_key <- config_list$hash_key
sample_column_name <- config_list$sample_column_name
# Sample Key
df_key <- data.table::fread(in_key)
pools <- unique(df_key$PoolID)
n_pools <- length(pools)
wells <- unlist(strsplit(unique(df_key$WellID), split = ";"))
n_wells <- length(wells)
samples <- unique(df_key$SampleID)
n_samples <- length(samples)
# Read in all hto key to match hto names to barcodes
hto_key <- system.file(file.path("reference",hash_key), package = "HTOparser") # Parameterize this value
in_hto_key <- fread(hto_key, header = FALSE, col.names = c("hto_barcode","hto_name")) %>%
mutate(hto_order = as.numeric(gsub("HT","", hto_name))) %>%
mutate(hto_name = factor(hto_name, levels = hto_name[order(hto_order)])) %>% # use HT number value to reorder the HT levels
select(-hto_order)
# Read in all hto json, check expected numbers vs input well info
all_hto_json <- list.files(path = in_hto,
pattern = "hto_processing_metrics.json$",
full.names = TRUE, recursive = TRUE)
n_json <- length(all_hto_json)
if(n_json == 0){
stop(sprintf("No 'hto_processing_metrics.json' files found in %s", in_hto))
} else if (n_json < n_wells){
hto_json_warn <- sprintf("Input number of 'hto_processing_metrics.json' files (%s) is fewer than number of wells (%s) in sample key",
n_json, n_wells)
warning(hto_json_warn)
hto_warning_list <- c(hto_warning_list, hto_json_warn)
}
stm(paste0("IN HTO Processing JSON Files :\n\t", paste(all_hto_json, collapse = "\n\t")))
cat(paste0("IN HTO Processing JSON Files :\n\t", paste(all_hto_json, collapse = "\n\t")))
## IN HTO Processing JSON Files :
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W1_hto_processing_metrics.json
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W2_hto_processing_metrics.json
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W3_hto_processing_metrics.json
# Read in all hto matrix files, check expected numbers vs input well info
all_hto_mat <- list.files(path = in_hto,
pattern = "hto_count_matrix.csv.gz$",
full.names = TRUE, recursive = TRUE)
n_hto_mat <- length(all_hto_mat)
if(n_hto_mat == 0){
stop(sprintf("No 'hto_count_matrix.csv.gz' files found in %s", in_hto))
} else if (n_hto_mat < n_wells){
hto_mat_warn <- sprintf("Input number of 'hto_count_matrix.csv.gz' files (%s) is fewer than number of wells (%s) in sample key",
n_json, n_wells)
warning(hto_mat_warn)
hto_warning_list <- c(hto_warning_list, hto_mat_warn)
}
stm(paste0("IN HTO Matrix Files :\n\t", paste(all_hto_mat, collapse = "\n\t")))
cat(paste0("IN HTO Matrix Files :\n\t", paste(all_hto_mat, collapse = "\n\t")))
## IN HTO Matrix Files :
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W1_hto_count_matrix.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W2_hto_count_matrix.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W3_hto_count_matrix.csv.gz
# Read in all hto metadata files, check expected numbers vs input well info
all_hto_meta <- list.files(path = in_hto,
pattern = "hto_category_table.csv.gz$",
full.names = TRUE, recursive = TRUE)
n_hto_meta <- length(all_hto_meta)
if(n_hto_meta == 0){
stop(sprintf("No 'hto_category_table.csv.gz' files found in %s", in_hto))
} else if (n_hto_meta < n_wells){
hto_meta_warn <- sprintf("Input number of 'hto_category_table.csv.gz' files (%s) is fewer than number of wells (%s) in sample key",
n_json, n_wells)
warning(hto_meta_warn)
hto_warning_list <- c(hto_warning_list, hto_meta_warn)
}
stm(paste0("IN HTO Metadata Files :\n\t", paste(all_hto_meta, collapse = "\n\t")))
cat(paste0("IN HTO Metadata Files :\n\t", paste(all_hto_meta, collapse = "\n\t")))
## IN HTO Metadata Files :
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W1_hto_category_table.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W2_hto_category_table.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W3_hto_category_table.csv.gz
# output warnings
if(length(hto_warning_list) == 0){
hto_warning_list <- "None"
}
simple_html_table(labels = "File Warnings", values = paste(hto_warning_list, collapse = "<br>"), col_widths_px = c(120, 600))
stm("Reading in hto metadata .csv files")
hto_meta_list <- lapply(all_hto_meta, fread)
hto_meta_wells <- gsub("_.*","",basename(all_hto_meta))
hto_meta_list <- mapply(function(x,y){x$well_id <- y; x}, hto_meta_list, hto_meta_wells, SIMPLIFY = F)
hto_meta <- do.call(rbind, hto_meta_list)
rm("hto_meta_list")
hto_meta[in_hto_key, on = 'hto_barcode', hto_name := i.hto_name] # merge in the hto names
hto_meta[ , pool_id := gsub("C\\dW\\d","", well_id)]
hto_meta[ , sample_hto:= sprintf("%s\n%s", hto_name, get(sample_column_name))]
hto_meta[ , sample_hto_pool:= sprintf("%s\n%s%s", hto_name, get(sample_column_name), pool_id)]
hto_meta[ , hto_order:= as.numeric(gsub("HT","", hto_name))]
hto_meta[ , sample_hto_pool:= factor(sample_hto_pool, levels = unique(sample_hto_pool[order(pool_id, hto_order)]))]
hto_meta[ , hto_order:= NULL]
hto_meta[ , hto_category:= factor(hto_category, levels = c("no_hash", "singlet", "doublet", "multiplet"))]
# hto_meta_list <- split(hto_meta, hto_meta$well_id)
hto_meta_singlet <- subset(hto_meta, hto_category=="singlet")
# Read in json files
stm("Reading in hto processing metrics json files")
well_hto_json_list <- lapply(all_hto_json, read_hto_well_json)
well_hto_json_df <- do.call(rbind, well_hto_json_list)
well_hto_json_df <- well_hto_json_df %>%
left_join(in_hto_key, by = "hto_barcode") %>%
dplyr::mutate(pool_id_short = gsub(".*-", "", gsub("C.*","", well_id))) %>%
dplyr::mutate(pool_id = gsub("C.*","", well_id)) %>%
dplyr::mutate(sample_hto = paste(hto_name, get(sample_column_name), sep = "\n")) %>%
dplyr::mutate(sample_hto = factor(sample_hto, levels = unique(sample_hto[order(hto_name, pool_id)]))) %>%
dplyr::mutate(hto_name_barcode = paste(hto_name, hto_barcode, sep = "\n")) %>%
dplyr::mutate(hto_name_barcode = factor(hto_name_barcode, levels = unique(hto_name_barcode[order(hto_name)]))) %>%
dplyr::mutate(sample_pool = paste(get(sample_column_name), pool_id_short, sep = "_")) %>%
dplyr::mutate(sample_pool_hto = paste(sample_pool, hto_barcode, sep = "\n"))
remove("well_hto_json_list")
stm("Reading in hash matrixes from labeled h5 and multiplet h5 files")
# read in as data.table, convert to data frame with hto barcodes as rownames
hto_mat_list <- lapply(all_hto_mat, fread)
stm("Formatting hto cutoff data")
hto_mat_list <- lapply(hto_mat_list, data.frame, row.names=1)
# name the list items by well
hto_mat_wells <- gsub("_.*", "", basename(all_hto_mat))
names(hto_mat_list) <- hto_mat_wells
# Join with metadata before merging well in case some cell barcodes are not unique
hto_df_list <- lapply(hto_mat_wells, function(x){
mat <- data.frame(t(hto_mat_list[[x]]))
setDT(mat, keep.rownames = "barcodes")
to_melt <- setdiff(colnames(mat), "barcodes")
mat <- melt(mat, id.vars ="barcodes", variable.name ="hto",
value.name= "count",measure.vars = to_melt)
mat <- mat[hto_meta, on = c("barcodes"= "cell_barcode"), `:=`(pool_id= i.pool_id, well_id=i.well_id)]
})
all_hash_fmt <- do.call(rbind, hto_df_list)
# Merge in metadata
all_hash_fmt <- all_hash_fmt[hto_meta[,.(cell_barcode, well_id, pool_id, hto_barcode, hto_category)],
on =c("barcodes"="cell_barcode","well_id"="well_id","pool_id"="pool_id")]
# Merge in well/hto level hashing info
all_hash_fmt <- all_hash_fmt[well_hto_json_df, on = c(hto="hto_barcode", well_id="well_id", pool_id = "pool_id")]
# Add variables for plotting hto distributions
all_hash_fmt[, log_count := ifelse(count>0, log10(count), NA)]
all_hash_fmt[, log_cutoff := ifelse(is.na(log_count), NA, log10(cutoff))]
all_hash_fmt[, log_count_bin := cut(log_count,
breaks = seq(min(log_count,na.rm=T),
max(log_count,na.rm=T),
length.out = 30))]
# Format labels as "hash not detected" for any well/hash that has no cells with hash count >5
low_hto_cutoff <- 5
all_hash_fmt[, n_count := sum(count > low_hto_cutoff), by = .(well_id, hto)]
all_hash_fmt[, detected := n_count > 0]
all_hash_fmt[, plot_label := ifelse(detected, "", "Hash not detected")]
all_hash_fmt[, log_count := ifelse(detected, log_count, NA)] # censor counts for plotting if overall hash not detected
all_hash_fmt[, log_cutoff := ifelse(detected, log_cutoff, NA)] # censor cutoffs for plotting if overall hash not detected
pools <- unique(all_hash_fmt$pool_id)
n_pools <- length(pools)
HTO UMI counts have been pre-filtered based on cell barcode. Only barcodes associated with cells called by CellRanger are included.
stm("Output total hto umi per pool")
pool_info <- all_hash_fmt %>%
dplyr::group_by(pool_id) %>%
dplyr::summarize(total_hto_umi = formatC(sum(count), big.mark = ",", digits = 0, format = "f"), .groups = "drop")
pool_info %>%
gt::gt() %>%
gt::cols_align(align = "right", columns = 2)
| pool_id | total_hto_umi |
|---|---|
| X070-P1 | 47,366,047 |
stm("Output barplot total hto umi per pool")
hash_by_well <- all_hash_fmt[,.(hto_count=sum(count)),
by = .(pool_id, well_id, hto, sample_pool_hto, hto_name_barcode)]
g <- ggplot(hash_by_well, aes(well_id, hto_count)) +
geom_bar(stat="identity", fill = "blue") +
theme_bw() +
scale_y_continuous(labels = scales::comma) +
guides(fill=guide_legend(title="HTO")) +
theme(axis.text.x = element_text(angle = 90, vjust=0.5)) +
xlab("Wells") +
ylab("Total HTO UMI") +
facet_wrap(~pool_id, ncol = n_pools, scales = "free_x", drop = TRUE)
tempwidth <- max(n_samples*0.5 + n_pools*0.1 + 0.2, 4)
make_subchunk(g, subchunk_name = "hto_totalumi_subchunk",quiet_knit = T,
chunk_opt_list = list(fig.width = tempwidth, fig.height = 3))
(function () { g})()
stm("Output barplot total hto umi per pool per hto")
g2 <- qc_aligned_barplot_facet(hash_by_well,
category_x = "well_id",
name_x = "Wells",
category_y = "hto_name_barcode",
category_name = "hto",
stat = "identity",
variable_y_identity = "hto_count",
name_y = "HTO UMI Counts",
facet_formula = formula("~pool_id"),
scales = "free_x",
drop = TRUE)
tempwidth2 <- n_samples*0.5 + n_pools*0.1 + 2.2
make_subchunk(g2, subchunk_name = "hto_umibyhto_subchunk",quiet_knit = T,
chunk_opt_list = list(fig.width = tempwidth, fig.height = 6))
(function () { g})()
Showing number of cells positively associated with each hash tag for each well. Includes singlets and multiplets.
stm("Calculations for heatmap positive hto per well")
# Expected HTO per well based on input sample sheet
all_wells <- trimws(unique(unlist(strsplit(df_key$WellID, split = ";"))))
all_hto <- in_hto_key$hto_name
df_expected_hto <- expand.grid(well_id = all_wells, hto_name = all_hto) %>%
left_join(in_hto_key, by = "hto_name")
df_expected_hto$hto_expected <- mapply(function(x, y){any(grepl(x, df_key$WellID[df_key$HashTag == y]))},
df_expected_hto$well_id,
df_expected_hto$hto_name)
# N Cells Pos HTO per well, defined as cells above cutoff
df_expected_hto$n_positive_cells <- mapply(function(x, y){
well_hto_json_df$n_pos[which(well_hto_json_df$well_id == x & well_hto_json_df$hto_name == y)]
},
df_expected_hto$well_id,
df_expected_hto$hto_name)
df_expected_hto$n_positive_cells[df_expected_hto$n_positive_cells <= 5] <- NA # change to NA to color undetected tiles uniquely
if(is.list(df_expected_hto$n_positive_cells)){
b_missing <- sapply(df_expected_hto$n_positive_cells, function(x){length(x)==0})
b_not_missing <- !b_missing
v_n_pos <- numeric(nrow(df_expected_hto))
v_n_pos[b_not_missing] <- unlist(df_expected_hto$n_positive_cells[b_not_missing])
v_n_pos[b_missing] <- NA
df_expected_hto$n_positive_cells <- v_n_pos
}
# Flag for cells not detected
df_expected_hto$hto_detection_status <- mapply(function(x, y){
ifelse(x == TRUE & is.na(y),"Missing",
ifelse(x == TRUE & !is.na(y), "Detected",
ifelse(x == FALSE & is.na(y), "Not Used", NA)))
},
df_expected_hto$hto_expected,
df_expected_hto$n_positive_cells)
# Heat Map
# temp_scale <- scales::rescale(c(0, 5, 1, max(df_expected_hto$n_positive_cells)))
stm("Output heatmap positive hto per well")
hm_pos <- ggplot(df_expected_hto, aes(hto_name, y=factor(well_id, levels = sort(unique(well_id), decreasing = TRUE)))) +
geom_tile(aes(fill = n_positive_cells, color = factor(hto_detection_status, levels = c("Not Used", "Detected","Missing"))),
width = 0.7, height = 0.7, size = 1) +
scale_fill_viridis_c(option = "D", na.value = "grey") +
scale_color_manual(name = "hto_detection_status", breaks = c("Not Used", "Detected","Missing"), values = c("grey","black","red"), drop = FALSE) +
scale_x_discrete(position = "top") +
xlab("HTO") +
ylab("Well") +
facet_grid(substr(well_id,1,7) ~., drop = T, scales = "free_y")
if(!exists("n_wells")){
wells <- unlist(strsplit(unique(df_key$WellID), split = ";"))
n_wells <- length(wells)
}
temp_figheight <- n_wells*0.25 + 1
temp_figwidth <- length(unique(df_expected_hto$hto_name))*0.4 + 2
make_subchunk(hm_pos, subchunk_name="hto_positive_matrix_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth))
(function () { g})()
A specific hashtag oligo (HTO) is detected in a well when >5 cells have any raw counts. Calculated cutoffs separate cells with background HTO count levels from cells considered positive for the HTO.
stm("Plotting hto count ridgeplots")
get_plt_ht_temp <- function(n_well_pool, n_hto, title_label_height = 0.5, row_height_constant = 3){
plot_height <- (n_well_pool*0.4 + row_height_constant)*ceiling(n_hto/7) + title_label_height
return(plot_height)
}
# Censor hto if detected in <= 5 cells in the well
for (i in seq_along(pools)){
# pool data
df <- all_hash_fmt %>%
dplyr::filter(pool_id == pools[i]) %>%
dplyr::mutate(well_id = factor(well_id, levels = sort(unique(well_id), decreasing=T)))
# cutoff line segment
df_lines <- df %>%
dplyr::select(well_id, sample_hto, hto, cutoff, log_cutoff) %>%
dplyr::filter(!is.na(log_cutoff)) %>%
dplyr::distinct()
# labels for hash not detected
detected_labels <- df %>%
dplyr::select(well_id, hto, sample_hto, plot_label) %>%
dplyr::distinct()
x_max <- max(all_hash_fmt$log_count, na.rm = T)
x_min <- -0.1
x_label <- (x_max - x_min)/2 + x_min
# Plot
g_ridge <- ggplot(df, aes(x = log_count, y = well_id)) +
ggridges::geom_density_ridges(
scale = 7,
stat = "binline",
binwidth = 0.1,
size = 0.5, # line width
aes(color = well_id), alpha = 0,
na.rm = TRUE) +
geom_segment(data = df_lines, aes(x = log_cutoff, xend = log_cutoff,
y = as.numeric(well_id), yend = as.numeric(well_id) + 0.9,
linetype = "well cutoff"),
color = "black", na.rm = TRUE) +
scale_x_continuous(limits = c(x_min, x_max)) +
scale_y_discrete(expand = c(0.01, 0)) +
scale_linetype_manual("cutoff",values=c("well cutoff"=1)) +
geom_text(data = detected_labels, x= x_label,
aes(y= as.numeric(well_id),label = plot_label),
size = 4, vjust = 0) +
facet_wrap(~sample_hto, ncol = 7) +
ggtitle(pools[i]) +
xlab("log10(Count)") +
ylab("Well ID") +
theme_bw() +
theme(axis.text.y = element_text(size = 12),
axis.title = element_text(size = 18),
plot.title = element_text(size = 24),
strip.text = element_text(size = 12),
legend.text = element_text(size = 12))
# Set Height
temp_height <- get_plt_ht_temp(n_well_pool = length(unique(df$well_id)), n_hto = length(unique(df$hto)))
# Output plot with custom chunk format
batchreporter::make_subchunk(g_ridge, subchunk_name = paste0("ridge_", i),
chunk_opt_list = list(fig.height = temp_height, fig.width = 18, warning = FALSE),
quiet_knit = TRUE)
}
(function () { g})()
stm("Plotting hto cutoff boxplots")
g_cutoff_box <- ggplot(well_hto_json_df, aes(sample_hto, cutoff, color = hto_barcode)) +
suppressWarnings(geom_point(alpha = 0.4, position = position_jitter(height = 0, width = 0.3, seed = 20201112),
aes(text = sprintf("Well ID: %s", well_id)))) +
geom_boxplot(alpha = 0, outlier.alpha = 1, color = "black") +
theme_bw()+
theme(axis.text.x = element_text(angle = 90)) +
facet_wrap(~pool_id, drop = TRUE, scales = "free_x", ncol = 2)+
ggtitle("Well-Specific HTO Cutoffs")
temp_figwidth <- (4*((n_pools>1)+1) + 3)
temp_figheight <- ceiling(n_pools/2)*5 + 0.4
ply_cutoff_box <- plotly::ggplotly(g_cutoff_box) %>%
adjust_axis_title_spacing_plotly("x", adjustment = -1.3/temp_figheight) %>%
adjust_axis_title_spacing_plotly("y", adjustment = -0.5/temp_figwidth) %>%
plotly::layout(xaxis = list(title = list(text = "", standoff = 30L)),
yaxis = list(title = list(text = "", standoff = 20L)))
make_subchunk(ply_cutoff_box, subchunk_name = "hto_assignment_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
stm("Plotting hto positive call box plots")
g_cutoff_pos <- ggplot(well_hto_json_df, aes(sample_hto, frac_pos, color = hto_barcode)) +
suppressWarnings(geom_point(alpha = 0.4, position = position_jitter(height = 0, seed = 20201112),
aes(text = sprintf("Well ID: %s", well_id)))) +
geom_boxplot(alpha = 0, outlier.alpha = 1, color = "black") +
theme_bw()+
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_text(margin = margin(c(15,0,1,0)))) +
facet_wrap(~pool_id, drop = TRUE, scales = "free_x", ncol = 2)+
ggtitle("Fraction Positive HTO Calls per Well")
temp_figwidth <- (4*((n_pools>1)+1) + 3)
temp_figheight <- ceiling(n_pools/2)*5 + 0.4
ply_cutoff_pos <- plotly::ggplotly(g_cutoff_pos) %>%
adjust_axis_title_spacing_plotly("x", adjustment = -1.3/temp_figheight) %>%
adjust_axis_title_spacing_plotly("y", adjustment = -0.5/temp_figwidth) %>%
plotly::layout(xaxis = list(title = list(text = "", standoff = 30L)),
yaxis = list(title = list(text = "", standoff = 20L)))
make_subchunk(ply_cutoff_pos, subchunk_name = "hto_positive_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
stm("Output pool based summary table")
pool_summary <- hto_meta[ ,.(n_cells = .N),
by = .(pool_id, hto_category)]
setorder(pool_summary, pool_id, hto_category)
pool_summary[, pct_cells := round(n_cells/sum(n_cells)*100,1), by = pool_id]
setcolorder(pool_summary, c("pool_id", "hto_category", "n_cells", "pct_cells"))
qc_table(pool_summary)
Return to Contents
stm("Plotting hto category counts per well")
g <- qc_aligned_barplot_facet(hto_meta,
category_x = "well_id",
name_x = "Well ID",
category_y = "hto_category",
category_name = "HTO Category",
colorset_y = "varibow",
name_y = "N Cells",
padding = 0.2,
facet_formula = formula("~pool_id"), nrow = 1, scales = "free_x")
temp_figwidth <- max(0.3*n_wells + 0.5*n_pools + 1.5, 4)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
make_subchunk(g, subchunk_name = "hto_cat_counts_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
stm("Plotting hto category fraction per well")
g <- qc_stacked_barplot_facet(hto_meta,
category_x = "well_id",
name_x = "Well ID",
category_y = "hto_category",
category_name = "HTO Category",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE, facet_formula = formula("~pool_id"), ncol=2, scales = "free_x")
temp_figwidth <- max(0.3*n_wells + 0.5*n_pools + 1.5, 4)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
make_subchunk(g, subchunk_name = "hto_cat_fraction_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
Expand data table of summary stats by well
well_summary <- hto_meta[ ,.(n_cells = .N),
by = .(pool_id, well_id, hto_category)]
well_summary[, pct_cells := round(n_cells/sum(n_cells)*100,1), by = c("pool_id", "well_id")]
setorder(well_summary, pool_id, well_id, hto_category)
qc_table(well_summary)
stm("Plotting barcode count per well and hto")
g <- qc_aligned_barplot_facet(meta = hto_meta_singlet,
category_x = "well_id",
name_x = "Well ID",
category_y = "sample_hto_pool",
category_name = "HTO/Sample",
colorset_y = "varibow",
name_y = "Number of Cells",
padding = 0.2,
facet_formula = formula("~pool_id"), nrow = 1, scales = "free", drop = TRUE)
# g <- cowplot::plot_grid(plotlist = plot_list, ncol = 2)
temp_figwidth <- max(0.3*n_wells + 0.5*n_pools + 2, 4)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
batchreporter::make_subchunk(g, subchunk_name = "hto_cat_count_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
stm("Plotting barcode fraction per well and hto")
g <- qc_stacked_barplot_facet(meta = hto_meta_singlet,
category_x = "well_id",
name_x = "Well ID",
category_y = "sample_hto_pool",
category_name = "HTO/Sample",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE,
facet_formula = formula("~pool_id"), nrow = 1, scales = "free_x") +
theme(text = element_text(size = 12))
temp_figwidth <- max(0.3*n_wells + 0.5*n_pools + 1.5, 4)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
batchreporter::make_subchunk(g, subchunk_name = "hto_fraction_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
Expand data table of summary stats by hash tag
stm("Output pool based summary table")
hto_singlet_summary <- hto_meta_singlet[ ,.(n_singlet_cells = .N),
by = setNames(list(pool_id, hto_name, hto_barcode, get(sample_column_name)), c('pool_id', 'hto_name','hto_barcode', sample_column_name))]
setorder(hto_singlet_summary, pool_id, hto_name)
setcolorder(hto_singlet_summary, c("pool_id",sample_column_name, "hto_name", "hto_barcode", "n_singlet_cells"))
qc_table(hto_singlet_summary)
stm("Plotting well count per hto")
plot_list <- list()
for (i in seq_along(pools)){
plot_list[[i]] <- qc_aligned_barplot_facet(meta = hto_meta_singlet[pool_id == pools[i],],
category_x = "sample_hto_pool",
category_y = "well_id",
category_name = "Well ID",
name_x = "HTO/Sample",
colorset_y = "varibow",
name_y = "Number of Cells",
facet_formula = formula("~pool_id"), nrow = 1, scales ="free_x", drop = TRUE)
}
g <- cowplot::plot_grid(plotlist = plot_list, ncol = 2)
temp_figwidth <- max(0.6*n_wells + 1*n_pools + 3, 8)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
batchreporter::make_subchunk(g, subchunk_name = "well_count_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE, class.output = "superbigimage"),
quiet_knit = TRUE)
(function () { g})()
stm("Plotting well fraction per hto")
g <- qc_stacked_barplot_facet(meta = hto_meta_singlet,
category_x = "sample_hto_pool",
category_y = "well_id",
category_name = "Well ID",
name_x = "HTO/Sample",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE ,
facet_formula = formula("~pool_id"), nrow = 1, scales ="free_x", drop = TRUE)
temp_figwidth <- max(0.3*n_wells + 0.5*n_pools + 1.5, 4)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
batchreporter::make_subchunk(g, subchunk_name = "well_fraction_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
HTO batch module v.1.0.0, Lauren Okada
The following metrics summarize the sequencing and alignment by 10x well prior to un-hashing and hash-based cell filtering.
Expand Code
Check Dependencies
Reading in well info from h5 files
if(!exists("well_info")){
# Merge Well Data
stm("Reading in labeled h5 file well data")
well_list <- lapply(all_h5, read_h5_well_meta)
# Find common columns
cols_keep <- Reduce(intersect, lapply(well_list, colnames))
well_list <- lapply(well_list, function(x){x[,cols_keep]})
well_info <- unique(do.call(rbind, well_list))
setDT(well_info)
well_info[, pool_id := gsub("C\\d+W\\d+", "", well_id)]
remove("well_list")
}
Reading in metadata from h5 files
stm("Reading and merging all rna meta data")
rna_meta_list <- lapply(all_h5, H5weaver::read_h5_cell_meta)
# make sure column names are the same
col_list <- lapply(rna_meta_list, colnames)
all_cols_identical <- length(unique(col_list)) == 1
if(!all_cols_identical){
all_columns <- unique(unlist(lapply(rna_meta_list, colnames)))
common_columns <- Reduce(union, lapply(rna_meta_list, colnames))
if(!all(all_columns %in% common_columns)){
stm(sprintf("Warning: rna h5 files do not contain the same meta data columns. Keeping only the common columns. Removing columns %s.",
paste(setdiff(all_columns, common_columns), sep = ", ")))
}
rna_meta_list <- lapply(rna_meta_list, function(x){x[, common_columns]})
}
# merge metadata
rna_meta <- do.call(rbind, rna_meta_list)
# add metadata variables
rna_meta$fct_mito_umi <- rna_meta$n_mito_umi/rna_meta$n_umis
fct_mito_grp_cutoffs <- c(-Inf, 0.05, 0.10, 0.20, 0.30,Inf)
fct_mito_grp_labels <- c("0-0.05","0.05-0.10","0.10-0.20","0.20-0.30",">0.30")
rna_meta$fct_mito_group <- cut(rna_meta$fct_mito_umi, breaks =fct_mito_grp_cutoffs,
labels = fct_mito_grp_labels)
Read in Counts from H5 Files
stm("Reading and merging all rna count matrices")
rna_count_list <- lapply(all_h5, H5weaver::read_h5_dgCMatrix, target = "matrix",
feature_names = "id")
# make sure all matrices have same number of rows
if(!length(unique(sapply(rna_count_list, nrow)))==1){
stop("RNA count matrixes have different numbers of rows")
}
# make sure rows are in same order
row_order <- rownames(rna_count_list[[1]])
rna_count_list <- lapply(rna_count_list, function(x){x[row_order,]})
# make sure columns are in same order as metadata
order_check <- mapply(function(x, y){(all(x$barcodes==colnames(y)))}, rna_meta_list, rna_count_list)
if(!all(order_check)){
# Reorder matrix columns to be consistent with metadata
rna_count_list <- mapply( function(x, y){x[,y$barcodes]}, rna_count_list, rna_meta_list)
}
# merge
rna_counts <- do.call(cbind, rna_count_list)
featDF <- read_h5_feature_meta(all_h5[1])
Sample cells from each well to generate umaps
n_cells_sample <- 2000
stm(sprintf("Sampling %s cells per well (or all cells if fewer)", n_cells_sample))
set.seed(3)
sample_index_list <- lapply(rna_meta_list, function(x){
sample_size = min(n_cells_sample, nrow(x))
sort(sample(1:nrow(x), size = sample_size, replace = FALSE))
})
n_files <- length(sample_index_list)
rna_meta_list_sampled <- lapply(1:n_files, function(x){
rna_meta_list[[x]][sample_index_list[[x]],]
})
rna_count_list_sampled <- lapply(1:n_files, function(x){
rna_count_list[[x]][,sample_index_list[[x]]]
})
rna_meta_sampled <- do.call(rbind, rna_meta_list_sampled)
rna_meta_sampled$fct_mito_umi <- rna_meta_sampled$n_mito_umi/rna_meta_sampled$n_umis
rna_meta_sampled$fct_mito_group <- cut(rna_meta_sampled$fct_mito_umi, breaks =fct_mito_grp_cutoffs,
labels = fct_mito_grp_labels)
rna_counts_sampled <- do.call(cbind, rna_count_list_sampled)
rm(list=c("rna_count_list", "rna_count_list_sampled","rna_meta_list"))
vnames_rna <- c("estimated_number_of_cells", "fraction_reads_in_cells",
"mean_reads_per_cell", "median_genes_per_cell", "median_umi_counts_per_cell",
"number_of_reads", "q30_bases_in_barcode", "q30_bases_in_rna_read",
"q30_bases_in_sample_index", "q30_bases_in_umi", "reads_mapped_antisense_to_gene",
"reads_mapped_confidently_to_exonic_regions", "reads_mapped_confidently_to_genome",
"reads_mapped_confidently_to_intergenic_regions","reads_mapped_confidently_to_intronic_regions",
"reads_mapped_confidently_to_transcriptome", "reads_mapped_to_genome",
"sequencing_saturation", "total_genes_detected", "valid_barcodes")
vnames_arc <- c("estimated_number_of_cells",
"gex_fraction_of_transcriptomic_reads_in_cells","gex_mean_raw_reads_per_cell","gex_median_genes_per_cell",
"gex_median_umi_counts_per_cell","gex_percent_duplicates","gex_q30_bases_in_barcode",
"gex_q30_bases_in_read_2","gex_q30_bases_in_sample_index_i1","gex_q30_bases_in_sample_index_i2",
"gex_q30_bases_in_umi","gex_reads_mapped_antisense_to_gene","gex_reads_mapped_confidently_to_exonic_regions",
"gex_reads_mapped_confidently_to_genome","gex_reads_mapped_confidently_to_intergenic_regions", "gex_reads_mapped_confidently_to_intronic_regions",
"gex_reads_mapped_confidently_to_transcriptome","gex_reads_mapped_to_genome","gex_reads_with_tso",
"gex_sequenced_read_pairs","gex_total_genes_detected","gex_valid_barcodes", "gex_valid_umis")
if(all(vnames_rna %in% colnames(well_info))){
vnames <- vnames_rna
vlabels <- c("Estimated Number of Cells", "Fraction Reads in Cells",
"Mean Reads per Cell", "Median Genes per Cell", "Median UMI per Cell",
"Number of Reads", "Q30 Bases in Barcode (%)", "Q30 Bases in RNA Read (%)",
"Q30 Bases in Sample Index (%)", "Q30 Bases in UMI (%)", "Reads Mapped Antisense to Gene (%)",
"Reads Mapped Confidently to Exonic Regions (%)", "Reads Mapped Confidently to Genome (%)",
"Reads Mapped Confidently to Intergenic Regions (%)",
"Reads Mapped Confidently to Intronic Regions (%)",
"Reads Mapped Confidently to Transcriptome (%)", "Reads Mapped to Genome (%)",
"Sequencing Saturation (%)", "Total Genes Detected", "Valid Barcodes (%)")
n_vars <- length(vnames)
vartypes <- c(rep("Cells", 5), rep("Sequencing", 5), rep("Mapping", 7),"Sequencing","Cells","Sequencing")
vartypes <- factor(vartypes, levels = c("Cells", "Sequencing", "Mapping"))
digitsRound <- c(0, 1, rep(0, 4), rep(1, 12), 0, 1)
rna_data_type <- "rna"
} else if (all(vnames_arc %in% colnames(well_info))){
vnames <- vnames_arc
vlabels <- c("Estimated Number of Cells","Fraction Transcriptomic Reads in Cells",
"Mean Raw Reads per Cell", "Median Genes per Cell", "Median UMI per Cell", "Percent Duplicates",
"Q30 Bases in Barcode", "Q30 Bases in RNA Read 2", "Q30 Bases in Sample Index i1",
"Q30 Bases in Sample Index i2", "Q30 Bases in UMI", "Reads Mapped Antisense to Gene",
"Reads Mapped Confidently to Exonic Regions", "Reads Mapped Confidently to Genome",
"Reads Mapped Confidently to Intergenic Regions",
"Reads Mapped Confidently to Intronic Regions",
"Reads Mapped Confidently to Transcriptome", "Reads Mapped to Genome", "Reads with TSO",
"Sequenced Read Pairs", "Total Genes Detected", "Valid Barcodes", "Valid UMIs")
n_vars <- length(vnames)
vartypes <- c(rep("Cells", 5), rep("Sequencing", 6), rep("Mapping", 7),"Sequencing","Sequencing","Cells","Sequencing","Sequencing")
digitsRound <- c(0, 3, 0, 0, 0, rep(3, 14), 0, 0, 3, 3)
rna_data_type <- "arc"
} else {
vnames <- setdiff( colnames(well_info), c("well_id","pool_id"))
vlabels <- gsub("_"," ", vnames)
vartypes <- rep(NA, length(vnames))
getDigits <- function(vNumeric){
vChar <- as.character(vNumeric)
is_decimal <- any(grepl("[.]", vChar))
if(is_decimal){
decimal_digits <- nchar(vChar) - (nchar(gsub("[.].*","", vChar))+1)
n_digits <- max(decimal_digits)
} else {
n_digits <- 0
}
n_digits
}
digitsRound <- sapply(subset(well_info, select = vnames), getDigits)
rna_data_type <- "unknown"
}
df_vars <- data.frame(Category = vartypes,
Variable_name = vlabels,
Variable = vnames,
Round = digitsRound)
Summary by pool or by entire non-hashed batch
stm("Creating scrna well cellranger summary table")
if(rna_data_type == "rna"){
pool_info <- well_info %>%
dplyr::group_by(pool_id) %>%
dplyr::summarize(total_cells = formatC(sum(estimated_number_of_cells), big.mark = ",", digits = 0, format = "f"),
total_reads = formatC(sum(number_of_reads), big.mark = ",", digits = 0, format = "f"), .groups = "drop")
} else if (rna_data_type == "arc"){
pool_info <- well_info %>%
dplyr::group_by(pool_id) %>%
dplyr::summarize(total_cells = formatC(sum(estimated_number_of_cells), big.mark = ",", digits = 0, format = "f"),
total_sequenced_read_pairs = formatC(sum(gex_sequenced_read_pairs), big.mark = ",", digits = 0, format = "f"), .groups = "drop")
} else {
print("Warning: No total reads column detected in well metadata")
pool_info <- well_info %>%
dplyr::group_by(pool_id) %>%
dplyr::summarize(total_cells = formatC(sum(estimated_number_of_cells), big.mark = ",", digits = 0, format = "f"),
total_reads = NA, .groups = "drop")
}
names(pool_info) <- stringr::str_to_title(gsub("_", " ", names(pool_info)))
pool_info %>%
gt::gt() %>%
gt::cols_align(align = "right", columns = 2:3)
| Pool Id | Total Cells | Total Sequenced Read Pairs |
|---|---|---|
| X070-P1 | 53,132 | 2,081,986,539 |
stm("Creating detailied scrna well cellranger table")
unique_pools <- sort(unique(well_info$pool_id))
well_summary_table <- well_info %>%
gather(key = Variable, value = Value, all_of(vnames)) %>% # all variables long
full_join(df_vars, by = "Variable") %>%
group_by(pool_id, Category, Variable, Variable_name) %>%
summarize(Median = formatC(median(Value, na.rm=T), big.mark = ",", digits = unique(Round), format = "f"),
Range = get_range(Value, digits = unique(Round), verbose = F),
`CV%` = round(sd(Value)/mean(Value)*100, 1),
.groups = "drop") %>%
arrange(Category, Variable_name) %>%
tidyr::pivot_wider(id_cols = c("Category", "Variable", "Variable_name"),
names_from = pool_id,
values_from = c("Median", "Range", "CV%"),
names_glue = "{pool_id}__{.value}",
names_sort = TRUE) %>%
mutate(Plot = sprintf("[Plot](#%s)", Variable)) %>%
select(Category, Variable_name, contains(unique_pools), Plot) # reorder cols by pools first then stats, keep only the clean var name
gt_table <- well_summary_table %>%
gt::gt() %>%
gt::fmt_markdown(columns = "Plot") %>% # convert the plot column text to markdown to activate links. these links generated with plots below.
gt::cols_width(vars(Category) ~ px(100),
vars(Variable_name) ~ px(150),
ends_with("Median") ~ px(100),
ends_with("Range") ~ px(100),
ends_with("CV%") ~ px(50),
vars(Plot) ~ px(60)) %>%
gt::tab_options(table.font.size = 11, column_labels.font.size = 12) %>%
gt::tab_spanner_delim(delim = "__") %>%
gt::cols_align(align = "right",
columns = c(ends_with("Median"), ends_with("Range"))) %>%
gt::cols_align(align = "center",
columns = vars(Plot)) %>%
gt::cols_label(Variable_name = "Variable")
gt_table
| Category | Variable | X070-P1 | Plot | ||
|---|---|---|---|---|---|
| Median | Range | CV% | |||
| Cells | Estimated Number of Cells | 17,743 | 17,190-18,199 | 2.9 | |
| Cells | Fraction Transcriptomic Reads in Cells | 0.885 | 0.880-0.886 | 0.4 | |
| Cells | Mean Raw Reads per Cell | 39,754 | 35,614-42,379 | 8.7 | |
| Cells | Median Genes per Cell | 1,992 | 1,962-2,050 | 2.2 | |
| Cells | Median UMI per Cell | 4,130 | 4,048-4,297 | 3.1 | |
| Cells | Total Genes Detected | 30,628 | 30,599-30,756 | 0.3 | |
| Mapping | Reads Mapped Antisense to Gene | 0.166 | 0.165-0.171 | 1.7 | |
| Mapping | Reads Mapped Confidently to Exonic Regions | 0.392 | 0.384-0.393 | 1.2 | |
| Mapping | Reads Mapped Confidently to Genome | 0.905 | 0.904-0.905 | 0.1 | |
| Mapping | Reads Mapped Confidently to Intergenic Regions | 0.071 | 0.071-0.072 | 0.3 | |
| Mapping | Reads Mapped Confidently to Intronic Regions | 0.442 | 0.441-0.449 | 1.0 | |
| Mapping | Reads Mapped Confidently to Transcriptome | 0.659 | 0.655-0.661 | 0.5 | |
| Mapping | Reads Mapped to Genome | 0.968 | 0.968-0.970 | 0.1 | |
| Sequencing | Percent Duplicates | 0.775 | 0.758-0.781 | 1.5 | |
| Sequencing | Q30 Bases in Barcode | 0.958 | 0.956-0.959 | 0.2 | |
| Sequencing | Q30 Bases in RNA Read 2 | 0.927 | 0.922-0.930 | 0.4 | |
| Sequencing | Q30 Bases in Sample Index i1 | 0.974 | 0.940-0.974 | 2.0 | |
| Sequencing | Q30 Bases in Sample Index i2 | 0.955 | 0.942-0.958 | 0.9 | |
| Sequencing | Q30 Bases in UMI | 0.959 | 0.957-0.959 | 0.1 | |
| Sequencing | Reads with TSO | 0.119 | 0.116-0.130 | 6.1 | |
| Sequencing | Sequenced Read Pairs | 705,357,243 | 648,135,670-728,493,626 | 6.0 | |
| Sequencing | Valid Barcodes | 0.940 | 0.939-0.941 | 0.1 | |
| Sequencing | Valid UMIs | 0.999 | 0.999-0.999 | 0.0 | |
Expand table of statistics per well
qc_table(well_info)
stm("Generating sequencing and alignment QC plots")
verpal <- hcl.colors(n = n_vars, palette = "viridis")
# Plots
for (i in seq_along(vnames)){
df <- data.table::copy(well_info)
spec <- vnames[i]
slabel <- vlabels[i]
df <- as.data.frame(df)
df$spec_col <- df[,spec]
med_val <- median(df$spec_col)
cv <- round(sd(df$spec_col)/mean(df$spec_col)*100, 2)
n <- sum(!is.na(df$spec_col))
g <- ggplot(df, aes(well_id, spec_col)) +
geom_bar(stat = "identity", fill = verpal[i]) +
geom_hline(yintercept = med_val, linetype = "dashed", color = "red")+
scale_y_continuous(sec.axis = dup_axis(breaks = med_val, labels = med_val, name = ""))+
xlab("Well") +
ylab(slabel) +
facet_wrap(~pool_id, ncol = n_pools, scales = "free_x", drop = TRUE) +
ggtitle(slabel,
subtitle = sprintf("Median=%s CV=%.1f%% N=%s", med_val, cv, n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Plot-specific hyperlink definition
cat(sprintf('\n<a id="%s"></a>', spec), labels = "", sep = "\n")
# Output plot
suppressWarnings(print(g))
# Link back to top of section
cat(" \n[Return to Contents](#rna_seq_well_top)", labels = "", sep = "\n")
}
# Reads per hto cat
stm("Generating read count violin plots")
# Reads per hto plot
g_read <- qc_violin_plot(rna_meta,
category_x = "well_id",
name_x = "Well",
column_y = "n_reads",
name_y = "N Reads per Cell",
log_y = TRUE,
fill = "dodgerblue") +
ggtitle("Reads per Well")
temp_figwidth = max(5, 0.5 + n_wells*0.4)
temp_figheight = 4
batchreporter::make_subchunk(g_read, subchunk_name = "well_read_violin_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth,
warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
# Reads per hto cat
stm("Generating umi count violin plots")
# UMI per hto plot
g_umi <- qc_violin_plot(rna_meta,
category_x = "well_id",
name_x = "Well",
column_y = "n_umis",
name_y = "N UMIs per Cell",
log_y = TRUE,
fill = "purple") +
ggtitle("UMIs per Well")
temp_figwidth = max(5, 0.5 + n_wells*0.4)
temp_figheight = 4
batchreporter::make_subchunk(g_umi, subchunk_name = "well_umi_violin_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth,
warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
# Reads per hto cat
stm("Generating gene count violin plots")
# Reads per hto plot
g_genes <- qc_violin_plot(rna_meta,
category_x = "well_id",
name_x = "Well",
column_y = "n_genes",
name_y = "N Genes per Cell",
log_y = TRUE,
fill = "orangered") +
ggtitle("Genes per Well")
temp_figwidth = max(5, 0.5 + n_wells*0.4)
temp_figheight = 4
batchreporter::make_subchunk(g_genes, subchunk_name = "well_gene_violin_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth,
warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
g_bar <- ggplot(rna_meta, aes(well_id, fill = factor(fct_mito_group,
levels = rev(fct_mito_grp_labels)))) +
geom_bar() +
labs(fill = "Fraction Mitochondrial UMIs") +
ylab("N Cells") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
g_pl <- plotly::ggplotly(g_bar) %>%
layout(hovermode = 'compare')
tempwidth <- n_samples*0.4 + 2
make_subchunk(g_pl, subchunk_name = "fraction_mito_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.width = tempwidth, fig.height = 6, echo = FALSE))
stm("Generating mt umi vs umi count scatter plots")
# Reads per hto plot
g_mito_umi <- ggplot(rna_meta, aes(n_umis, fct_mito_umi)) +
geom_point(color = "purple", alpha = 0.2) +
xlab("UMI per Cell (log10 scale)") +
ylab("Fraction Mitochondrial UMI") +
scale_x_log10() +
ggtitle("Fraction Mitochondrial UMI vs UMI per Cell")
g_mito_umi
Expand Per-Well Plot
stm("Generating mt umi vs umi count scatter plots by well")
n_wells_plot <- length(unique(rna_meta$well_id))
ncols_plot <- 6
nrows_plot <- ceiling(n_wells_plot/ncols_plot)
# Reads per hto plot
g_mito_umi_well <- g_mito_umi +
facet_wrap(~well_id, ncol = ncols_plot)
temp_figwidth <- 2*min(n_wells_plot, ncols_plot)+0.3
temp_figheight <- 2*nrows_plot + 0.3
make_subchunk(g_mito_umi_well, subchunk_name = "fraction_mito_umis_well_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.width = temp_figwidth, fig.height = temp_figheight, echo = FALSE))
stm("Generating mt umi vs gene count scatter plots")
# Reads per hto plot
g_mito_gene <- ggplot(rna_meta, aes(n_genes, fct_mito_umi)) +
geom_point(color = "orangered", alpha = 0.2) +
xlab("Genes per Cell (log10 scale)") +
ylab("Fraction Mitochondrial UMI") +
scale_x_log10() +
ggtitle("Fraction Mitochondrial UMI vs UMI per Cell")
g_mito_gene
Expand Per-Well Plot
stm("Generating mt umi vs gene count scatter plots by well")
n_wells_plot <- length(unique(rna_meta$well_id))
ncols_plot <- 6
nrows_plot <- ceiling(n_wells_plot/ncols_plot)
# Reads per hto plot
g_mito_gene_well <- g_mito_gene +
facet_wrap(~well_id, ncol = ncols_plot)
temp_figwidth <- 2*min(n_wells_plot, ncols_plot)+0.3
temp_figheight <- 2*nrows_plot + 0.3
make_subchunk(g_mito_gene_well, subchunk_name = "fraction_mito_genes_well_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.width = temp_figwidth, fig.height = temp_figheight, echo = FALSE))
Expand code
# Create Seurat object
stm("Creating Seurat object from merged data for scRNA UMAP")
merged_so <- Seurat::CreateSeuratObject(counts = rna_counts_sampled)
# Normalize data
stm("Normalizing data for scRNA UMAP")
merged_so <- Seurat::NormalizeData(object = merged_so,
normalization.method = "LogNormalize",
scale.factor = 10000,
margin = 1)
normCounts <- merged_so[["RNA"]]@data
pc_dims <- min(ncol(merged_so) - 1, 50)
# suppressWarnings(future::plan("multiprocess", workers = avail_workers))
stm("Finding Variable Features for scRNA UMAP")
merged_so <- FindVariableFeatures(object = merged_so)
stm("Scaling Data for scRNA UMAP")
merged_so <- ScaleData(object = merged_so, verbose = FALSE)
stm("Running PCA for scRNA UMAP")
merged_so <- RunPCA(object = merged_so, npcs = pc_dims, verbose = FALSE)
# suppressWarnings(future::plan("multiprocess", workers = avail_workers))
stm("Determining dimensionality via jackstraw method for scRNA UMAP")
labels_order <- rna_meta_sampled$well_id[match(colnames(merged_so),rna_meta_sampled$barcodes)]
names(labels_order) <- colnames(merged_so)
jackstraw_cells <- sample_cells(labels_order, 500, seed = 3030)
jackstraw_so <- merged_so[, jackstraw_cells]
jackstraw_so <- JackStraw(object = jackstraw_so,
dims = pc_dims,
num.replicate = 50,
verbose = FALSE)
jackstraw_so <- ScoreJackStraw(object = jackstraw_so,
dims = 1:pc_dims)
pc_pvals <- jackstraw_so@reductions$pca@jackstraw$overall.p.values[,2]
good_pcs <- pc_pvals < 0.05
nPC <- sum(good_pcs)
pc_var <- Stdev(merged_so, reduction = "pca")^2
total_var <- merged_so@reductions$pca@misc$total.variance
var_selected_pc <- sum(pc_var[good_pcs])/total_var
cumvar_string <- sprintf(fmt = "%.1f", var_selected_pc*100)
stm(sprintf("Selected %s significant pcs via JackStraw, %s%% explained variation for scRNA UMAP", nPC, cumvar_string))
pc_embeddings <- merged_so@reductions$pca@cell.embeddings
# stm(sprintf("Using %s principal components for umap",nPC))
# Run UMAP
# suppressWarnings(future::plan("multiprocess", workers = avail_workers))
stm(sprintf("Running UMAP on selected coordinates for scRNA UMAP"))
merged_so <- Seurat::RunUMAP(merged_so,
dims = c(1:50)[good_pcs],
umap.method = "uwot",
seed.use = 3,
verbose = FALSE)
# Get UMAP coordinates
umapDF <- merged_so[["umap"]]@cell.embeddings %>%
as.data.frame() %>%
dplyr::rename(UMAP_1_merged = UMAP_1, UMAP_2_merged = UMAP_2) %>%
tibble::rownames_to_column(var = "barcodes")
umapDF <- merge(umapDF, rna_meta_sampled, by = "barcodes")
rownames(umapDF) <- umapDF$barcodes
umapDF <- umapDF[rownames(merged_so@"meta.data"), , drop = F]
normCounts <- merged_so[["RNA"]]@data
if(!all(rownames(umapDF) == colnames(normCounts))){
stop("Merged UMAP dataframe does not match columns in normalized counts")
}
Batch-level UMAP of 2000 randomly sampled cells per well using 34 principal components (21.3% explained variance) selected by jackstraw.
stm("Plotting Batch scRNA UMAP by Data Quality Metrics")
# Gene scaling
max_genes <- max(8000, rna_meta$n_genes)
max_umi <- max(60000, rna_meta$n_umis)
point_size <- 0.2
# Cell Types
g_base <- ggplot(umapDF, aes(UMAP_1_merged, UMAP_2_merged))
# fraction mitochondrial umi
stm("Plotting Fraction Mito UMAP")
g1 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "Fraction Mitochondrial UMIs",
point_size = point_size,
color_col = "fct_mito_umi",
scale_color_fun = scale_color_fct_mito)
# N Genes
stm("Plotting N Genes UMAP")
g2 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "N Genes",
point_size = point_size,
color_col = "n_genes",
scale_color_fun = scale_color_genes(max_genes)
)
# N UMIs
stm("Plotting N UMIs UMAP")
g3 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "N UMIs",
point_size = point_size,
color_col = "n_umis",
scale_color_fun = scale_color_umis(max_umi)
)
# Wells
stm("Plotting Well UMAP")
cols_well <- H5weaver::varibow(n_colors = length(unique(umapDF$well_id)))
scale_color_well <- function(...){
scale_color_manual(values = cols_well, ...)
}
g4 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "Well ID",
point_size = point_size,
color_col = "well_id",
scale_color_fun = scale_color_well
) +
guides(colour = guide_legend(override.aes = list(size = 3)))
aligned_plots <- cowplot::align_plots(g1, g2, g3, g4, align = "hv", axis = "tblr") # uniform plot sizing
cowplot::plot_grid(aligned_plots[[1]],
aligned_plots[[2]],
aligned_plots[[3]],
aligned_plots[[4]],
ncol = 2)
scRNA seq report well module v.1.0.0, Lauren Okada
scrna_seq_sample_module_version <- "1.0.0"
Expand Code
# Allow display of large images without shrinking to page width
<style>
.superbigimage{
overflow-x:scroll;
}
.superbigimage img{
max-width: none;
}
</style>
Check analysis dependencies
# Config
config_list<- batchreporter::load_config(in_config)
hash_key <- config_list$hash_key
sample_column_name <- config_list$sample_column_name
# Sample Key
df_key <- data.table::fread(in_key)
pools <- unique(df_key$PoolID)
n_pools <- length(pools)
wells <- unlist(strsplit(unique(df_key$WellID), split = ";"))
n_wells <- length(wells)
samples <- unique(df_key$SampleID)
n_samples <- length(samples)
Reading in metadata from h5 files
stm("Reading and merging all rna meta data")
rna_meta_list <- lapply(all_h5, H5weaver::read_h5_cell_meta)
# make sure column names are the same
col_list <- lapply(rna_meta_list, colnames)
all_cols_identical <- length(unique(col_list)) == 1
if(!all_cols_identical){
all_columns <- unique(unlist(lapply(rna_meta_list, colnames)))
common_columns <- Reduce(union, lapply(rna_meta_list, colnames))
if(!all(all_columns %in% common_columns)){
stm(sprintf("Warning: rna h5 files do not contain the same meta data columns. Keeping only the common columns. Removing columns %s.",
paste(setdiff(all_columns, common_columns), sep = ", ")))
}
rna_meta_list <- lapply(rna_meta_list, function(x){x[, common_columns]})
}
# merge metadata
rna_meta <- do.call(rbind, rna_meta_list)
# add metadata variables
rna_meta$fct_mito_umi <- rna_meta$n_mito_umi/rna_meta$n_umis
fct_mito_grp_cutoffs <- c(-Inf, 0.05, 0.10, 0.20, 0.30,Inf)
fct_mito_grp_labels <- c("0-0.05","0.05-0.10","0.10-0.20","0.20-0.30",">0.30")
rna_meta$fct_mito_group <- cut(rna_meta$fct_mito_umi, breaks =fct_mito_grp_cutoffs,
labels = fct_mito_grp_labels)
Read in Counts from H5 Files
stm("Reading and merging all rna count matrices")
rna_count_list <- lapply(all_h5, H5weaver::read_h5_dgCMatrix, target = "matrix",
feature_names = "id")
# make sure all matrices have same number of rows
if(!length(unique(sapply(rna_count_list, nrow)))==1){
stop("RNA count matrixes have different numbers of rows")
}
# make sure rows are in same order
row_order <- rownames(rna_count_list[[1]])
rna_count_list <- lapply(rna_count_list, function(x){x[row_order,]})
# make sure columns are in same order as metadata
order_check <- mapply(function(x, y){(all(x$barcodes==colnames(y)))}, rna_meta_list, rna_count_list)
if(!all(order_check)){
# Reorder matrix columns to be consistent with metadata
rna_count_list <- mapply( function(x, y){x[,y$barcodes]}, rna_count_list, rna_meta_list)
}
# merge
rna_counts <- do.call(cbind, rna_count_list)
featDF <- read_h5_feature_meta(all_h5[1])
Read in HTO files
stm("Reading in hto key for hashed RNA analysis")
# Read in all hto key to match hto names to barcodes
hto_key <- system.file(file.path("reference",hash_key), package = "HTOparser") # Parameterize this value
in_hto_key <- fread(hto_key, header = FALSE, col.names = c("hto_barcode","hto_name")) %>%
mutate(hto_order = as.numeric(gsub("HT","", hto_name))) %>%
mutate(hto_name = factor(hto_name, levels = hto_name[order(hto_order)])) %>% # use HT number value to reorder the HT levels
select(-hto_order)
# Read in all hto metadata files, check expected numbers vs input well info
all_hto_meta <- list.files(path = in_hto,
pattern = "hto_category_table.csv.gz$",
full.names = TRUE, recursive = TRUE)
n_hto_meta <- length(all_hto_meta)
if(n_hto_meta == 0){
stop(sprintf("No 'hto_category_table.csv.gz' files found in %s", in_hto))
} else if (n_hto_meta < n_wells){
hto_meta_warn <- sprintf("Input number of 'hto_category_table.csv.gz' files (%s) is fewer than number of wells (%s) in sample key",
n_json, n_wells)
warning(hto_meta_warn)
hto_warning_list <- c(hto_warning_list, hto_meta_warn)
}
stm(paste0("IN HTO Metadata Files :\n\t", paste(all_hto_meta, collapse = "\n\t")))
cat(paste0("IN HTO Metadata Files :\n\t", paste(all_hto_meta, collapse = "\n\t")))
## IN HTO Metadata Files :
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W1_hto_category_table.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W2_hto_category_table.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W3_hto_category_table.csv.gz
stm("Reading in hto category metadata for hashed RNA analysis")
hto_meta_list <- lapply(all_hto_meta, fread)
hto_meta_wells <- gsub("_.*","",basename(all_hto_meta))
hto_meta_list <- mapply(function(x,y){x$well_id <- y; x}, hto_meta_list, hto_meta_wells, SIMPLIFY = F)
hto_meta <- do.call(rbind, hto_meta_list)
rm("hto_meta_list")
Add in HTO Metadata
stm("Formatting in hto category metadata for hashed RNA analysis")
hto_meta[in_hto_key, on = 'hto_barcode', hto_name := i.hto_name] # merge in the hto names
hto_meta[ , pool_id := gsub("C\\dW\\d","", well_id)]
hto_meta[ , sample_hto:= sprintf("%s\n%s", hto_name, get(sample_column_name))]
hto_meta[ , sample_hto_pool:= sprintf("%s\n%s%s", hto_name, get(sample_column_name), pool_id)]
hto_meta[ , hto_order:= as.numeric(gsub("HT","", hto_name))]
hto_meta[ , sample_hto_pool:= factor(sample_hto_pool, levels = unique(sample_hto_pool[order(pool_id, hto_order)]))]
hto_meta[ , hto_order:= NULL]
hto_meta[ , hto_category:= factor(hto_category, levels = c("no_hash", "singlet", "doublet", "multiplet"))]
# hto_meta_singlet <- subset(hto_meta, hto_category=="singlet")
stm("Merging hto category metadata with RNA metadata")
rna_hto_meta <- rna_meta %>%
left_join(hto_meta, by = c("original_barcodes"="cell_barcode", "well_id" = "well_id","pool_id"="pool_id"))
hto_meta_singlet <- subset(rna_hto_meta, hto_category=="singlet")
hto_meta_singlet <- as.data.table(hto_meta_singlet)
hto_meta_singlet_list <- split(hto_meta_singlet, by = sample_column_name)
Sample singlet cells from each well and sample to generate umaps
n_cells_sample <- 2000
stm(sprintf("Sampling %s cells per sample and well (or all cells if fewer)", n_cells_sample))
set.seed(3)
sample_index_list <- lapply(hto_meta_singlet_list, function(x){
sample_size = min(n_cells_sample, nrow(x))
sort(sample(1:nrow(x), size = sample_size, replace = FALSE))
})
n_files <- length(sample_index_list)
rna_meta_list_sampled <- lapply(1:n_files, function(x){
hto_meta_singlet_list[[x]][sample_index_list[[x]],]
})
rna_count_list_sampled <- lapply(1:n_files, function(x){
rna_counts[, rna_meta_list_sampled[[x]]$barcodes]
})
rna_meta_sampled <- do.call(rbind, rna_meta_list_sampled)
rna_meta_sampled$fct_mito_umi <- rna_meta_sampled$n_mito_umi/rna_meta_sampled$n_umis
rna_meta_sampled$fct_mito_group <- cut(rna_meta_sampled$fct_mito_umi, breaks =fct_mito_grp_cutoffs,
labels = fct_mito_grp_labels)
rna_counts_sampled <- do.call(cbind, rna_count_list_sampled)
rm(list=c("rna_count_list", "rna_count_list_sampled","rna_meta_list"))
stm("Output pool based summary table")
hto_singlet_summary <- hto_meta_singlet[ ,.(n_singlet_cells = .N,
median_reads = median(n_reads),
median_umis = median(n_umis),
median_genes = median(n_genes)),
by = setNames(list(pool_id, hto_name, hto_barcode, get(sample_column_name)), c('pool_id', 'hto_name','hto_barcode', sample_column_name))]
# by = .(pool_id, hto_name, hto_barcode, get(sample_column_name))]
setorder(hto_singlet_summary, pool_id, hto_name)
setcolorder(hto_singlet_summary, c("pool_id",sample_column_name, "hto_name", "hto_barcode", "n_singlet_cells",
"median_reads", "median_umis", "median_genes"))
qc_table(hto_singlet_summary)
stm("Plotting well count per hto")
plot_list <- list()
for (i in seq_along(pools)){
plot_list[[i]] <- qc_aligned_barplot_facet(meta = hto_meta_singlet[pool_id == pools[i],],
category_x = "sample_hto_pool",
category_y = "well_id",
category_name = "Well ID",
name_x = "HTO/Sample",
colorset_y = "varibow",
name_y = "Number of Cells",
facet_formula = formula("~pool_id"), nrow = 1, scales ="free_x", drop = TRUE)
}
g <- cowplot::plot_grid(plotlist = plot_list, ncol = 2)
temp_figwidth <- max(0.6*n_wells + 1*n_pools + 3, 8)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
batchreporter::make_subchunk(g, subchunk_name = "scrna_sample_hto_count_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE, class.output = "superbigimage"),
quiet_knit = TRUE)
(function () { g})()
stm("Plotting well fraction per hto")
g <- qc_stacked_barplot_facet(meta = hto_meta_singlet,
category_x = "sample_hto_pool",
category_y = "well_id",
category_name = "Well ID",
name_x = "HTO/Sample",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE ,
facet_formula = formula("~pool_id"), nrow = 1, scales ="free_x", drop = TRUE)
temp_figwidth <- max(0.3*n_wells + 0.5*n_pools + 1.5, 4)
temp_figheight <- ceiling(n_pools/2)*4 + 0.4
batchreporter::make_subchunk(g, subchunk_name = "scrna_sample_fraction_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
# Reads per hto cat
stm("Generating sample read count violin plots")
category_reads_violins <- qc_violin_plot(rna_hto_meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue")
# Reads per hto plot
g_read <- qc_violin_plot(hto_meta_singlet,
category_x = "sample_hto_pool",
name_x = "HTO/Sample (singlets)",
column_y = "n_reads",
name_y = "N Reads per Cell, Singlets",
log_y = TRUE,
fill = "dodgerblue") +
ggtitle("Reads per Cell")
# g_read
reads_violin_list <- list(category_reads_violins,
g_read)
g_grid <- plot_grid(plotlist = reads_violin_list,
ncol = 2, rel_widths = c(1, 1+n_samples/4),
nrow = 1, align = "h")
temp_figwidth = 3 + n_samples*0.4
temp_figheight = 4
batchreporter::make_subchunk(g_grid, subchunk_name = "scrna_sample_counts_violin_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth,
warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
Expand plots of reads/cell distributions by hto and well
stm("Plotting read count violin plots per hto and well")
all_sample_hto_pool <- unique(sprintf("%s\n%s%s",df_key$HashTag,df_key$SampleID, df_key$PoolID))
plot_list <- list()
for(i in seq_along(all_sample_hto_pool)){
temp_meta <- filter(hto_meta_singlet, sample_hto_pool == all_sample_hto_pool[i])
g <- qc_violin_plot(temp_meta,
category_x = "well_id",
name_x = "Well",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue") +
theme(text = element_text(size =8)) +
ggtitle(all_sample_hto_pool[i])
plot_list[[i]] <- g
}
g_grid <- cowplot::plot_grid(plotlist = plot_list, align = "tblr", ncol = 6)
temp_figwidth <- 15
temp_figheight <- ceiling(n_samples/6)*4 + 0.4
batchreporter::make_subchunk(g_grid, subchunk_name = "reads_well_hto_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
# UMI per category plot
stm("Generating sample umi count violin plots")
category_umis_violins <- qc_violin_plot(rna_hto_meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple")
# UMI per cell plot
g_umi <- qc_violin_plot(hto_meta_singlet,
category_x = "sample_hto_pool",
name_x = "HTO/Sample (singlets)",
column_y = "n_umis",
name_y = "N UMIs per Cell, Singlets",
fill = "purple") +
ggtitle("UMIs per Cell")
# g_umi
umis_violin_list <- list(category_umis_violins,
g_umi)
g_grid <- plot_grid(plotlist = umis_violin_list,
ncol = 2,
# rel_widths = c(1, 4),
rel_widths = c(1, 1+n_samples/4),
nrow = 1, align = "h")
temp_figwidth = 3 + n_samples*0.4
temp_figheight = 4
batchreporter::make_subchunk(g_grid, subchunk_name = "scrna_sample_umi_violin_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth,
warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
Expand plots of UMIs/cell distributions per per well by hto
stm("Plotting umi count violin plots per hto and well")
all_sample_hto_pool <- unique(sprintf("%s\n%s%s",df_key$HashTag,df_key$SampleID, df_key$PoolID))
plot_list <- list()
for(i in seq_along(all_sample_hto_pool)){
temp_meta <- filter(hto_meta_singlet, sample_hto_pool == all_sample_hto_pool[i])
g <- qc_violin_plot(temp_meta,
category_x = "well_id",
name_x = "Well",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple") +
theme(text = element_text(size =8)) +
ggtitle(all_sample_hto_pool[i])
plot_list[[i]] <- g
}
g_grid <- cowplot::plot_grid(plotlist = plot_list, align = "tblr", ncol = 6)
temp_figwidth <- 15
temp_figheight <- ceiling(n_samples/6)*4 + 0.4
batchreporter::make_subchunk(g_grid, subchunk_name = "umis_well_hto_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
# Genes per category plot
category_genes_violins <- qc_violin_plot(rna_hto_meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered")
# Genes per cell plot
g_genes <- qc_violin_plot(hto_meta_singlet,
category_x = "sample_hto_pool",
name_x = "HTO/Sample (singlets)",
column_y = "n_genes",
name_y = "N Genes per Cell, Singlets",
fill = "orangered") +
ggtitle("Genes per Cell")
genes_violin_list <- list(category_genes_violins,
g_genes)
g_grid <- plot_grid(plotlist = genes_violin_list,
ncol = 2,
rel_widths = c(1, 1+n_samples/4),
# rel_widths = c(1, 4),
nrow = 1, align = "h")
temp_figwidth = 3 + n_samples*0.4
temp_figheight = 4
batchreporter::make_subchunk(g_grid, subchunk_name = "scrna_sample_genes_violin_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth,
warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
Expand plots of genes/cell distributions per per well by hto
stm("Plotting gene count violin plots per hto and well")
all_sample_hto_pool <- unique(sprintf("%s\n%s%s",df_key$HashTag,df_key$SampleID, df_key$PoolID))
plot_list <- list()
for(i in seq_along(all_sample_hto_pool)){
temp_meta <- filter(hto_meta_singlet, sample_hto_pool == all_sample_hto_pool[i])
g <- qc_violin_plot(temp_meta,
category_x = "well_id",
name_x = "Well",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered") +
theme(text = element_text(size =8)) +
ggtitle(all_sample_hto_pool[i])
plot_list[[i]] <- g
}
g_grid <- cowplot::plot_grid(plotlist = plot_list, align = "tblr", ncol = 6)
temp_figwidth <- 15
temp_figheight <- ceiling(n_samples/6)*4 + 0.4
batchreporter::make_subchunk(g_grid, subchunk_name = "genes_well_hto_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
g_bar <- ggplot(hto_meta_singlet, aes(!!as.name(sample_column_name),
fill = factor(fct_mito_group,
levels = rev(fct_mito_grp_labels)))) +
geom_bar() +
labs(fill = "Fraction Mitochondrial UMIs") +
ylab("N Cells") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
g_pl <- plotly::ggplotly(g_bar) %>%
layout(hovermode = 'compare')
tempwidth <- n_samples*0.4 + 2
make_subchunk(g_pl, subchunk_name = "scrna_sample_fraction_mito_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.width = tempwidth, fig.height = 6, echo = FALSE))
# Reads per hto cat
stm("Generating mt umi vs umi count scatter plots by well")
# Reads per hto plot
g_mito_umi <- ggplot(hto_meta_singlet, aes(n_umis, fct_mito_umi)) +
geom_point(color = "purple", alpha = 0.2) +
xlab("UMI per Cell (log10 scale)") +
ylab("Fraction Mitochondrial UMI") +
scale_x_log10() +
ggtitle("Fraction Mitochondrial UMI vs UMI per Cell")
n_samples_plot <- length(unique(hto_meta_singlet[[sample_column_name]]))
ncols_plot <- 6
nrows_plot <- ceiling(n_samples_plot/ncols_plot)
# Reads per hto plot
g_mito_umi_sample <- g_mito_umi +
facet_wrap(as.formula(paste0("~", sample_column_name)),
ncol = ncols_plot)
temp_figwidth <- 2*min(n_samples_plot, ncols_plot)+0.3
temp_figheight <- 2*nrows_plot + 0.3
make_subchunk(g_mito_umi_sample, subchunk_name = "scrna_sample_fraction_mito_umis_sample_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.width = temp_figwidth, fig.height = temp_figheight, echo = FALSE))
Expand Per-Well Plot
# Reads per hto cat
stm("Generating mt umi vs umi count scatter plots per hto and well")
all_sample_hto_pool <- unique(sprintf("%s\n%s%s",df_key$HashTag,df_key$SampleID, df_key$PoolID))
plot_list <- list()
for(i in seq_along(all_sample_hto_pool)){
temp_meta <- hto_meta_singlet %>%
dplyr::filter(sample_hto_pool == all_sample_hto_pool[i])
g <- ggplot(temp_meta, aes(n_umis, fct_mito_umi)) +
geom_point(color = "purple", alpha = 0.2) +
xlab("UMI per Cell (log10 scale)") +
ylab("Fraction Mitochondrial UMI") +
scale_x_log10() +
facet_grid( ~ well_id ) +
ggtitle(all_sample_hto_pool[i])
plot_list[[i]] <- g
}
g_grid <- cowplot::plot_grid(plotlist = plot_list, align = "tblr", ncol = 1)
temp_figwidth <- 15
temp_figheight <- n_samples*4 + 0.4
batchreporter::make_subchunk(g_grid, subchunk_name = "fctmito_umiscatter_well_hto_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
# Genes per hto cat
stm("Generating mt umi vs gene count scatter plots by well")
# Genes per hto plot
g_mito_gene <- ggplot(hto_meta_singlet, aes(n_genes, fct_mito_umi)) +
geom_point(color = "orangered", alpha = 0.2) +
xlab("Genes per Cell (log10 scale)") +
ylab("Fraction Mitochondrial UMI") +
scale_x_log10() +
ggtitle("Fraction Mitochondrial UMI vs Genes per Cell")
n_samples_plot <- length(unique(hto_meta_singlet[[sample_column_name]]))
ncols_plot <- 6
nrows_plot <- ceiling(n_samples_plot/ncols_plot)
# Reads per hto plot
g_mito_genes_sample <- g_mito_gene +
facet_wrap(as.formula(paste0("~", sample_column_name)),
ncol = ncols_plot)
temp_figwidth <- 2*min(n_samples_plot, ncols_plot)+0.3
temp_figheight <- 2*nrows_plot + 0.3
make_subchunk(g_mito_genes_sample, subchunk_name = "scrna_sample_fraction_mito_genes_sample_subchunk", quiet_knit = T,
chunk_opt_list = list(fig.width = temp_figwidth, fig.height = temp_figheight, echo = FALSE))
Expand Per-Well Plot
# Reads per hto cat
stm("Generating mt umi vs gene count scatter plots per hto and well")
all_sample_hto_pool <- unique(sprintf("%s\n%s%s",df_key$HashTag,df_key$SampleID, df_key$PoolID))
plot_list <- list()
for(i in seq_along(all_sample_hto_pool)){
temp_meta <- hto_meta_singlet %>%
dplyr::filter(sample_hto_pool == all_sample_hto_pool[i])
g <- ggplot(temp_meta, aes(n_genes, fct_mito_umi)) +
geom_point(color = "orangered", alpha = 0.2) +
xlab("UMI per Cell (log10 scale)") +
ylab("Fraction Mitochondrial UMI") +
scale_x_log10() +
facet_grid( ~ well_id ) +
ggtitle(all_sample_hto_pool[i])
plot_list[[i]] <- g
}
g_grid <- cowplot::plot_grid(plotlist = plot_list, align = "tblr", ncol = 1)
temp_figwidth <- 15
temp_figheight <- n_samples*4 + 0.4
batchreporter::make_subchunk(g_grid, subchunk_name = "fctmito_genescatter_well_hto_subchunk",
chunk_opt_list = list(fig.height = temp_figheight, fig.width = temp_figwidth, warning = FALSE),
quiet_knit = TRUE)
(function () { g})()
Expand Code
# Create Seurat object
stm("Creating Seurat object from merged data for hashed RNA umap")
merged_so <- Seurat::CreateSeuratObject(counts = rna_counts_sampled)
# Normalize data
stm("Normalizing data for hashed RNA umap")
merged_so <- Seurat::NormalizeData(object = merged_so,
normalization.method = "LogNormalize",
scale.factor = 10000,
margin = 1)
normCounts <- merged_so[["RNA"]]@data
pc_dims <- min(ncol(merged_so) - 1, 50)
# suppressWarnings(future::plan("multiprocess", workers = avail_workers))
stm("Finding Variable Features for hashed RNA umap")
merged_so <- Seurat::FindVariableFeatures(object = merged_so, )
stm("Scaling Data for hashed RNA umap")
merged_so <- Seurat::ScaleData(object = merged_so, verbose = FALSE)
stm("Running PCA for hashed RNA umap")
merged_so <- Seurat::RunPCA(object = merged_so, npcs = pc_dims, verbose = FALSE)
# suppressWarnings(future::plan("multiprocess", workers = avail_workers))
stm("Determining dimensionality via jackstraw method for hashed RNA umap")
labels_order <- hto_meta_singlet[,get(sample_column_name)][match(colnames(merged_so),hto_meta_singlet$barcodes)]
names(labels_order) <- colnames(merged_so)
jackstraw_cells <- sample_cells(labels_order, 100, seed = 3030)
jackstraw_so <- merged_so[,jackstraw_cells]
jackstraw_so <- Seurat::JackStraw(object = jackstraw_so,
dims = pc_dims,
num.replicate = 50,
verbose = FALSE)
jackstraw_so <- Seurat::ScoreJackStraw(object = jackstraw_so,
dims = 1:pc_dims)
pc_pvals <- jackstraw_so@reductions$pca@jackstraw$overall.p.values[,2]
good_pcs <- pc_pvals < 0.05
nPC <- sum(good_pcs)
pc_var <- Stdev(merged_so, reduction = "pca")^2
total_var <- merged_so@reductions$pca@misc$total.variance
var_selected_pc <- sum(pc_var[good_pcs])/total_var
cumvar_string <- sprintf(fmt = "%.1f", var_selected_pc*100)
stm(sprintf("Selected %s significant pcs via JackStraw, %s%% explained variation", nPC, cumvar_string))
pc_embeddings <- merged_so@reductions$pca@cell.embeddings
# stm(sprintf("Using %s principal components for umap",nPC))
# Run UMAP
# suppressWarnings(future::plan("multiprocess", workers = avail_workers))
stm(sprintf("Running UMAP on selected coordinates for hashed RNA umap"))
merged_so <- Seurat::RunUMAP(merged_so,
dims = c(1:50)[good_pcs],
umap.method = "uwot",
seed.use = 3,
verbose = FALSE)
# Get UMAP coordinates
umapDF <- merged_so[["umap"]]@cell.embeddings %>%
as.data.frame() %>%
dplyr::rename(UMAP_1_merged = UMAP_1, UMAP_2_merged = UMAP_2) %>%
tibble::rownames_to_column(var = "barcodes")
umapDF <- merge(umapDF, hto_meta_singlet, by = "barcodes")
rownames(umapDF) <- umapDF$barcodes
umapDF <- umapDF[rownames(merged_so@"meta.data"), , drop = F]
normCounts <- merged_so[["RNA"]]@data
if(!all(rownames(umapDF) == colnames(normCounts))){
stop("Merged UMAP dataframe does not match columns in normalized counts for hashed RNA UMAP")
}
Batch-level UMAP using 32 principal components (21.2% explained variance) selected by jackstraw. Singlet cells only, 2000 randomly sampled cells per Sample ID.
stm("Plotting Batch UMAP by Data Quality Metrics for hashed RNA")
# Gene scaling
max_genes <- max(8000, hto_meta_singlet$n_genes)
max_umi <- max(60000, hto_meta_singlet$n_umis)
point_size <- 0.2
# Cell Types
g_base <- ggplot(umapDF, aes(UMAP_1_merged, UMAP_2_merged))
# fraction mitochondrial umi
stm("Plotting Fraction Mito UMAP")
g3 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "Fraction Mitochondrial UMIs",
point_size = point_size,
color_col = "fct_mito_umi",
scale_color_fun = scale_color_fct_mito)
# N Genes
stm("Plotting N Genes UMAP")
g4 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "N Genes",
point_size = point_size,
color_col = "n_genes",
scale_color_fun = scale_color_genes(max_genes)
)
# N UMIs
stm("Plotting N UMIs UMAP")
g5 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "N UMIs",
point_size = point_size,
color_col = "n_umis",
scale_color_fun = scale_color_umis(max_umi)
)
# Wells
stm("Plotting Well UMAP")
cols_well <- H5weaver::varibow(n_colors = length(unique(umapDF$well_id)))
scale_color_well <- function(...){
scale_color_manual(values = cols_well, ...)
}
g6 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "Well ID",
point_size = point_size,
color_col = "well_id",
scale_color_fun = scale_color_well
) +
guides(colour = guide_legend(override.aes = list(size = 3)))
# Sample
stm("Plotting Sample ID UMAP")
cols_sample <- H5weaver::varibow(n_colors = length(unique(umapDF[,sample_column_name])))
scale_color_sample <- function(...){
scale_color_manual(values = cols_sample, ...)
}
g7 <- plot_umap_report(df = umapDF,
x_col="UMAP_1_merged",
x_lab = "UMAP 1",
y_col = "UMAP_2_merged",
y_lab = "UMAP 2",
title = "Sample ID",
point_size = point_size,
color_col = sample_column_name,
scale_color_fun = scale_color_sample
) +
guides(colour = guide_legend(override.aes = list(size = 3)))
aligned_plots <- cowplot::align_plots(g3, g4, g5, g6, g7, align = "hv", axis = "tblr") # uniform plot sizing
cowplot::plot_grid(aligned_plots[[1]],
aligned_plots[[2]],
aligned_plots[[3]],
aligned_plots[[4]],
aligned_plots[[5]],
ncol = 2)
scRNA seq report sample module v.1.0.0, Lauren Okada
Expand output
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(ggplot2)
quiet_library(Seurat)
quiet_library(viridis)
quiet_library(dplyr)
quiet_library(plyr)
quiet_library(data.table)
stm("Identifying files for ADT analysis")
# Input directory in_adt should be defined in the parent Rmarkdown report.
if(is.null(in_adt)) {
warning("No input directory for adt batch report, defaulting to test data")
in_adt <- system.file("extdata/X070/adt", package = "batchreporter")
}
adt_count_files <- list.files(in_adt,
pattern = "Tag_Counts.csv")
well_names <- gsub('.{15}$', '', adt_count_files)
adt_count_filepaths <- list.files(in_adt, pattern = "Tag_Counts.csv", full.names = T)
adt_pos_count_filepaths <- list.files(in_adt, pattern = "positive_tag_counts.csv", full.names = T)
adt_pos_count_files <- basename(adt_pos_count_filepaths)
cat(sprintf("IN ADT Tag Count files: \n\t%s", paste(adt_count_files, collapse = "\n\t")), sep = "\n")
## IN ADT Tag Count files:
## X070-EP1C1W1_Tag_Counts.csv
## X070-EP1C1W2_Tag_Counts.csv
## X070-EP1C1W3_Tag_Counts.csv
stm(sprintf("IN ADT Tag Count files: \n\t%s", paste(adt_count_files, collapse = "\n\t")))
cat(sprintf("IN POSITIVE ADT Tag Count files: \n\t%s", paste(adt_pos_count_files, collapse = "\n\t")), sep = "\n")
## IN POSITIVE ADT Tag Count files:
## X070-EP1C1W1_adt_positive_tag_counts.csv
## X070-EP1C1W2_adt_positive_tag_counts.csv
## X070-EP1C1W3_adt_positive_tag_counts.csv
stm(sprintf("IN POSITIVE ADT Tag Count files: \n\t%s", paste(adt_pos_count_files, collapse = "\n\t")))
Expand output
# all_merge <- Reduce(merge,seurat_list)
merge_seurat <- function(so_list){
if (length(so_list) > 2){
temp <- merge(x = so_list[[1]], y = so_list[[2]])
i = 3
while (i < length(so_list) + 1){
temp <- merge(x = temp, y = so_list[[i]])
i = i + 1
}
return(temp)
}
else{
temp <- merge(x = so_list[[1]], y = so_list[[2]])
}
return(temp)
}
all_merge <- merge_seurat(seurat_list)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 53132
## Number of edges: 1375642
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9449
## Number of communities: 45
## Elapsed time: 6 seconds
DimPlot(adt_positives, group.by = 'well')
n_features <- length(rownames(adt_positives))
height = ceiling((n_features / 3) * 2)
i = 1
colsums_list <- list()
while (i < length(count_list) + 1){
temp_adt_so <- seurat_list[[i]]
temp_adt_so <- SetIdent(temp_adt_so, value = 'orig.ident')
temp_adt_positives <- subset(temp_adt_so, idents = 'Cells')
temp_adt_pos_mtx <- t(data.frame(temp_adt_positives@assays$RNA@counts))
colsums_df <- data.frame(colSums(temp_adt_pos_mtx))
colnames(colsums_df) <- 'Count'
colsums_df$ADT <- rownames(colsums_df)
colsums_df$Well <- rep(well_names[[i]], length(rownames(colsums_df)))
colsums_list[[i]] <- colsums_df
i = i + 1
}
combined_colsums <- bind_rows(colsums_list, .id = "column_label")
ggplot(combined_colsums, aes(x = reorder(ADT, -Count), y = Count, color = Well)) + geom_point(size = 5) +
geom_hline(yintercept=median(combined_colsums$Count), linetype="dashed", color = "blue") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + scale_y_log10()
ggplot(combined_colsums, aes(x = reorder(ADT, -Count), y = Count, color = Well)) + geom_point(size = 5) +
geom_hline(yintercept=median(combined_colsums$Count), linetype="dashed", color = "blue") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ADT report module v.1.0.0, Zach Thomson
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(ggplot2)
quiet_library(Seurat)
quiet_library(viridis)
quiet_library(dplyr)
quiet_library(plyr)
quiet_library(data.table)
# Input directories in_adt and in_hto should be defined in the parent Rmarkdown report.
# For testing purposes
if(is.null(in_adt)) {
warning("No input directory for adt batch report, defaulting to test data")
in_adt <- system.file("extdata/X070/adt", package = "batchreporter")
}
if(is.null(in_hto)) {
warning("No input directory for hto batch report, defaulting to test data")
in_hto <- system.file("extdata/X070/hto", package = "batchreporter")
}
#create lists for future mapply functions
adt_count_filepaths <- list.files(in_adt, pattern = "tag_counts.csv", full.names = T)
hto_category_filepaths <- list.files(in_hto, pattern = "hto_category_table.csv.gz", full.names = T)
# Check input files
if(length(adt_count_filepaths) == 0) {
stop("Can't find tag_counts.csv files for hashed ADT module processing. Check input 'adt' subdirectory for *tag_counts.csv files.")
}
if(length(hto_category_filepaths) == 0) {
stop("Can't find hto_category_table.csv.gz files for hashed ADT module processing. Check input 'hto' subdirectory for *hto_category_table.csv.gz files.")
}
cat(sprintf("IN ADT Tag Count files: \n\t%s", paste(adt_count_filepaths, collapse = "\n\t")), sep = "\n")
## IN ADT Tag Count files:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/adt/X070-EP1C1W1_adt_positive_tag_counts.csv
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/adt/X070-EP1C1W2_adt_positive_tag_counts.csv
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/adt/X070-EP1C1W3_adt_positive_tag_counts.csv
stm(sprintf("IN ADT Tag Count files: \n\t%s", paste(adt_count_filepaths, collapse = "\n\t")))
cat(sprintf("IN HTO Category files: \n\t%s", paste(hto_category_filepaths, collapse = "\n\t")), sep = "\n")
## IN HTO Category files:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W1_hto_category_table.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W2_hto_category_table.csv.gz
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto/X070-P1C1W3_hto_category_table.csv.gz
stm(sprintf("IN HTO Category files: \n\t%s", paste(hto_category_filepaths, collapse = "\n\t")))
Expand output
violin_plot <- ggplot(combined_df, aes(x = pbmc_sample_id, y = total, fill=pbmc_sample_id)) +
geom_violin() +
scale_y_log10() +
ggtitle('ADT UMIs by Sample')
violin_plot +
stat_summary(fun=median, geom = "point", color="black") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
geom_hline(yintercept=1000, linetype="dashed", color = "blue") +
geom_hline(yintercept=300, linetype="solid", color = "red")
Hashed ADT batch module v.1.0.0, Zach Thomson
start_time_atac <- Sys.time()
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(data.table)
quiet_library(H5weaver)
quiet_library(ggplot2)
quiet_library(cowplot)
quiet_library(jsonlite)
quiet_library(purrr)
options(stringsAsFactors = FALSE)
Declaring start
stm("Starting scATAC Batch Report module")
if(is.null(in_atac)) {
in_atac <- system.file("testdata/batch_qc", package = "ATAComb")
batch_id <- "X055"
out_dir <- tempdir()
}
stm(paste0("IN results dir : ", in_atac))
stm(paste0("IN BatchID : ", batch_id))
stm(paste0("OUT Directory : ", out_dir))
print(c(
paste0("IN results dir : ", in_atac),
paste0("IN BatchID : ", batch_id),
paste0("OUT H5 directory : ", out_dir)
))
## [1] "IN results dir : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac"
## [2] "IN BatchID : X070"
## [3] "OUT H5 directory : /var/folders/zc/cdsbwh5d60d_2h9c9brv90s80000gp/T//Rtmpo06rqu"
if(!dir.exists(in_atac)) {
stm(paste("ERROR: Cannot find IN results dir:", in_atac))
stop()
}
if(!dir.exists(out_dir)) {
stm(paste("Creating output directory:", out_dir))
dir.create(out_dir)
}
out_prefix <- file.path(out_dir, paste0(batch_id, "_"))
Unfiltered metadata
meta_files <- list.files(in_atac,
pattern = "_all_metadata.csv.gz$",
full.names = TRUE)
if(length(meta_files) == 0) {
stop("Can't find unfiltered metadata files. Check input directory for *_all_metadata.csv.gz files.")
}
stm("IN Full Metadata Files:")
for(meta_file in meta_files) {
stm(meta_file)
print(meta_file)
}
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W1_all_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W2_all_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W3_all_metadata.csv.gz"
meta_list <- map(meta_files, fread)
sample_names <- sub(".+/","",sub("_all_metadata.csv.gz","",meta_files))
names(meta_list) <- sample_names
Filtered metadata
filt_meta_files <- list.files(in_atac,
pattern = "_filtered_metadata.csv.gz$",
full.names = TRUE)
if(length(filt_meta_files) < length(meta_files)) {
stop("Can't find filtered metadata files. Check input directory for *_filtered_metadata.csv.gz files.")
} else if(length(filt_meta_files) > length(meta_files)) {
stop("Can't find all metadata files. Check input directory for *_all_metadata.csv.gz files.")
}
stm("IN Filtered Metadata Files:")
for(filt_meta_file in filt_meta_files) {
stm(filt_meta_file)
print(filt_meta_file)
}
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W1_filtered_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W2_filtered_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W3_filtered_metadata.csv.gz"
filt_meta_list <- map(filt_meta_files, fread)
names(filt_meta_list) <- sub(".+/","",sub("_filtered_metadata.csv.gz","",filt_meta_files))
filt_meta_list <- filt_meta_list[sample_names]
Saturation projections
sat_files <- list.files(in_atac,
pattern = "_saturation_projection.csv.gz$",
full.names = TRUE)
if(length(sat_files) < length(meta_files)) {
stop("Can't find all saturation files. Check input directory for *_saturation_projection.csv.gz files.")
} else if(length(sat_files) > length(meta_files)) {
stop("Can't find all metadata files. Check input directory for *_all_metadata.csv.gz files.")
}
stm("IN Saturation Projection Files:")
for(sat_file in sat_files) {
stm(sat_file)
print(sat_file)
}
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W1_saturation_projection.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W2_saturation_projection.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W3_saturation_projection.csv.gz"
names(sat_files) <- sub(".+/","",sub("_saturation_projection.csv.gz","",sat_files))
sat_files <- sat_files[sample_names]
sat_list <- map(sat_files, fread)
Fragment widths
width_files <- list.files(in_atac,
pattern = "_fragment_width_summary.csv.gz",
full.names = TRUE)
if(length(width_files) < length(meta_files)) {
stop("Can't find all fragment width files. Check input directory for *_fragment_width_summary.csv.gz files.")
} else if(length(sat_files) > length(meta_files)) {
stop("Can't find all metadata files. Check input directory for *_all_metadata.csv.gz files.")
}
stm("IN Fragment Width Summary Files:")
for(width_file in width_files) {
stm(width_file)
print(width_file)
}
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W1_fragment_width_summary.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W2_fragment_width_summary.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W3_fragment_width_summary.csv.gz"
names(width_files) <- sub(".+/","",sub("_fragment_width_summary.csv.gz","",width_files))
width_files <- width_files[sample_names]
width_list <- map(width_files, fread)
filtered_meta <- do.call(rbind, filt_meta_list)
meta <- do.call(rbind, meta_list)
cutoffs <- list(altius_frac = 0.5,
tss_frac = 0.2,
peaks_frac = 0.2)
meta$pass_fail <- "pass"
for(i in seq_along(cutoffs)) {
cut_name <- names(cutoffs)[i]
cut_val <- cutoffs[[i]]
cut_logic <- meta[[cut_name]] <= cut_val
meta$pass_fail[cut_logic] <- "fail"
}
meta$filtered <- meta$barcodes %in% filtered_meta$barcodes
meta$mito_frac <- meta$n_mito / meta$n_fragments
meta$well_label <- paste0(meta$well_id, "\n", meta$pbmc_sample_id)
well_samples <- unique(meta[,c("well_id","pbmc_sample_id","well_label")])
stm("Filtering based on QC cutoffs")
meta <- meta %>%
dplyr::left_join(dplyr::select(filtered_meta, barcodes, DoubletScore, DoubletEnrichment, TSSEnrichment), by = "barcodes")
filtered_meta <- meta
for(i in seq_along(cutoffs)) {
cut_name <- names(cutoffs)[i]
cut_val <- cutoffs[[i]]
filtered_meta <- filtered_meta[filtered_meta[[cut_name]] > cut_val]
filtered_meta <- filtered_meta[filtered_meta$filtered,]
}
filtered_meta$well_label <- paste0(filtered_meta$well_id, "\n", filtered_meta$pbmc_sample_id)
well_filtered_meta_list <- split(filtered_meta, filtered_meta$well_id)
Set up global metadata for reporting
meta$barcode_category <- "fail_qc"
meta$barcode_category[!meta$filtered & meta$pass_fail == "pass"] <- "pass_doublet"
meta$barcode_category[meta$filtered & meta$pass_fail == "pass"] <- "pass_singlet"
qc_list <- list(report_type = "atac_batch_qc",
report_datetime = as.character(start_time_atac),
report_uuid = ids::uuid(use_time = TRUE),
package = "ATAComb",
package_version = sessionInfo()$otherPkgs$ATAComb$Version,
batch_id = sub("_.+","",sample_names[1]))
out_json <- paste0(out_prefix, "atac_batch_qc_metrics.json")
barcode_counts <- meta[,.(n_barcodes = nrow(.SD),
n_pass_qc = sum(.SD$pass_fail == "pass"),
n_fail_qc = sum(.SD$pass_fail == "fail"),
percent_fail = round(sum(.SD$pass_fail == "fail")/nrow(.SD)*100,2),
pass_singlets = sum(.SD$barcode_category == "pass_singlet"),
pass_doublets = sum(.SD$barcode_category == "pass_doublet"),
percent_doublets = round(sum(.SD$barcode_category == "pass_doublet")/sum(.SD$pass_fail == "pass")*100,2)),
.(well_id,pbmc_sample_id)]
qc_list$barcode_stats <- as.list(barcode_counts)
qc_table(barcode_counts)
qc_stacked_barplot(meta,
category_x = "well_label",
name_x = "Well ID",
category_y = "barcode_category",
category_name = "Barcode Category",
as_fraction = TRUE)
qc_aligned_barplot(meta,
category_x = "well_label",
name_x = "Well ID",
category_y = "barcode_category",
category_name = "Barcode Category")
qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "DoubletEnrichment",
name_y = "Doublet Enrichment",
log_y = FALSE,
fill = "dodgerblue")
qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "DoubletScore",
name_y = "Doublet Score",
log_y = FALSE,
fill = "dodgerblue")
qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "TSSEnrichment",
name_y = "TSS Enrichment",
log_y = FALSE,
fill = "dodgerblue")
Plot Settings
n_grid_columns <- min(length(well_filtered_meta_list),4)
n_grid_rows <- ceiling(length(well_filtered_meta_list)/4)
grid_width <- n_grid_columns * 3
grid_height <- n_grid_rows * 3
fragment_stats <- filtered_meta[,.(n_singlets = nrow(.SD[.SD$filtered]),
med_raw_frag = round(median(n_fragments),0),
med_raw_perc_mito = round(median(mito_frac)*100,4),
med_unique_frag = round(median(n_unique),0),
med_unique_fritss = round(median(tss_frac),4),
med_unique_frip = round(median(peaks_frac),4),
med_unique_encode = round(median(altius_frac),4)
),
.(well_id,pbmc_sample_id)]
qc_list$fragment_stats <- as.list(fragment_stats)
qc_table(fragment_stats)
category_reads_violins <- qc_violin_plot(meta,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "n_unique",
name_y = "Unique Fragments",
fill = "dodgerblue")
well_reads_violins <- qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "n_unique",
name_y = "Unique Fragments (Singlets)",
fill = "dodgerblue")
reads_violin_list <- list(category_reads_violins,
well_reads_violins)
plot_grid(plotlist = reads_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
category_mito_violins <- qc_violin_plot(meta,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "mito_frac",
name_y = "Fraction Mitochondrial",
fill = "darkgreen",
log_y = FALSE)
well_mito_violins <- qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "mito_frac",
name_y = "Fraction Mito. (Singlets)",
fill = "darkgreen",
log_y = FALSE)
mito_violin_list <- list(category_mito_violins,
well_mito_violins)
plot_grid(plotlist = mito_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
category_frip_violins <- qc_violin_plot(meta,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "peaks_frac",
name_y = "FRIP",
fill = "orangered",
log_y = FALSE)
well_frip_violins <- qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "peaks_frac",
name_y = "FRIP (Singlets)",
fill = "orangered",
log_y = FALSE)
frip_violin_list <- list(category_frip_violins,
well_frip_violins)
plot_grid(plotlist = frip_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
qc_scatter_list <- map(well_filtered_meta_list,
function(well_meta) {
qc_scatter_plot(well_meta,
column_x = "n_unique",
name_x = "N Unique Fragments per Cell",
column_y = "peaks_frac",
name_y = "Frac Fragments in Peaks (peaks_frac)",
log_x = TRUE, log_y = FALSE, frac_y = TRUE,
show_targets = FALSE,
color = "orangered") +
geom_vline(aes(xintercept = 2.5e3), linetype = "dashed", size = 0.2) +
geom_hline(aes(yintercept = cutoffs$peaks_frac), linetype = "dashed", size = 0.2) +
ggtitle(well_meta$well_label[1])
})
plot_grid(plotlist = qc_scatter_list,
ncol = n_grid_columns,
nrow = n_grid_rows)
category_fritss_violins <- qc_violin_plot(meta,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "tss_frac",
name_y = "FRITSS",
fill = "mediumorchid3",
log_y = FALSE)
well_fritss_violins <- qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "tss_frac",
name_y = "FRITSS (Singlets)",
fill = "mediumorchid3",
log_y = FALSE)
fritss_violin_list <- list(category_fritss_violins,
well_fritss_violins)
plot_grid(plotlist = fritss_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
qc_scatter_list <- map(well_filtered_meta_list,
function(well_meta) {
qc_scatter_plot(well_meta,
column_x = "n_unique",
name_x = "N Unique Fragments per Cell",
column_y = "tss_frac",
name_y = "Frac Fragments in TSS (tss_frac)",
log_x = TRUE, log_y = FALSE, frac_y = TRUE,
show_targets = FALSE,
color = "mediumorchid3") +
geom_vline(aes(xintercept = 2.5e3), linetype = "dashed", size = 0.2) +
geom_hline(aes(yintercept = cutoffs$tss_frac), linetype = "dashed", size = 0.2) +
ggtitle(well_meta$well_label[1])
})
plot_grid(plotlist = qc_scatter_list,
ncol = n_grid_columns,
nrow = n_grid_rows)
category_enc_violins <- qc_violin_plot(meta,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "altius_frac",
name_y = "FRIENCODE",
fill = "darkred",
log_y = FALSE)
well_enc_violins <- qc_violin_plot(filtered_meta,
category_x = "well_label",
name_x = "Well ID",
column_y = "altius_frac",
name_y = "FRIENCODE (Singlets)",
fill = "darkred",
log_y = FALSE)
enc_violin_list <- list(category_enc_violins,
well_enc_violins)
plot_grid(plotlist = enc_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
qc_scatter_list <-map(well_filtered_meta_list,
function(well_meta) {
qc_scatter_plot(well_meta,
column_x = "n_unique",
name_x = "N Unique Fragments per Cell",
column_y = "altius_frac",
name_y = "Frac Fragments in Altius (altius_frac)",
log_x = TRUE, log_y = FALSE, frac_y = TRUE,
show_targets = FALSE,
color = "darkred") +
geom_vline(aes(xintercept = 2.5e3), linetype = "dashed", size = 0.2) +
geom_hline(aes(yintercept = cutoffs$altius_frac), linetype = "dashed", size = 0.2) +
ggtitle(well_meta$well_label[1])
})
plot_grid(plotlist = qc_scatter_list,
ncol = n_grid_columns,
nrow = n_grid_rows)
all_sat <- map_dfr(1:length(sat_list),
function(x) {
sat <- sat_list[[x]]
sat_pbmc_sample <- sub(paste0(batch_id,"_"),"",names(sat_list)[x])
sat$pbmc_sample_id <- sat_pbmc_sample
sat$well_id <- well_samples$well_id[well_samples$pbmc_sample_id == sat_pbmc_sample]
sat$well_label <- well_samples$well_label[well_samples$pbmc_sample_id == sat_pbmc_sample]
sat
})
all_sat <- as.data.table(all_sat)
sat_cutoffs <- all_sat[,.(M_raw = max(.SD$M_raw_reads[.SD$type == "actual"]),
M_umis = max(.SD$M_umis[.SD$type == "actual"]),
M_signal = max(.SD$M_signal_umis[.SD$type == "actual"]),
M_raw_2x_UMIs = .SD$M_raw_reads[which.min(abs(.SD$ratio - 2))[1]],
M_raw_4x_UMIs = .SD$M_raw_reads[which.min(abs(.SD$ratio - 4))[1]],
M_raw_2x_signal = .SD$M_raw_reads[which.min(abs(.SD$signal_ratio - 2))[1]],
M_raw_4x_signal = .SD$M_raw_reads[which.min(abs(.SD$signal_ratio - 4))[1]],
M_raw_8x_signal = .SD$M_raw_reads[which.min(abs(.SD$signal_ratio - 8))[1]]),
.(well_id,pbmc_sample_id)]
qc_list$saturation_stats <- as.list(sat_cutoffs)
qc_table(sat_cutoffs)
reference_projections <- fread(system.file("reference/saturation/reference_projections.csv",
package = "ATAComb"))
reference_projections$dataset <- paste0("ref_",reference_projections$dataset)
umi_saturation_plot <- ggplot() +
geom_line(data = reference_projections,
aes(x = n_raw_reads,
y = expected_umis,
group = dataset,
color = dataset)) +
geom_line(data = all_sat,
aes(x = n_raw_reads,
y = expected_umis,
group = well_label,
color = type),
size = 2) +
scale_color_brewer("Data Type",
type = "qual",
palette = 2) +
scale_x_continuous("N Raw Reads (millions)",
expand = c(0,0),
limits = c(0, 1.5e9),
breaks = seq(0, 1.5e9, by = 2.5e8),
labels = seq(0, 1500, by = 250)) +
scale_y_continuous("N Unique Fragments (millions)",
expand = c(0,0),
limits = c(0, 5e8),
breaks = seq(0, 5e8, by = 1e8),
labels = seq(0, 500, by = 100)) +
theme_bw() +
theme(panel.grid.minor = element_blank()) +
facet_wrap(facets = vars(well_label), ncol = 4) +
ggtitle("Saturation of All Unique Fragments")
umi_saturation_plot
Signal fragments are reads in Altius Index regions in barcodes passing QC
signal_saturation_plot <- ggplot() +
geom_line(data = reference_projections,
aes(x = n_raw_reads,
y = signal_umis,
group = dataset,
color = dataset)) +
geom_line(data = all_sat,
aes(x = n_raw_reads,
y = signal_umis,
group = well_label,
color = type),
size = 2) +
scale_color_brewer("Data Type",
type = "qual",
palette = 2) +
scale_x_continuous("N Raw Reads (millions)",
expand = c(0,0),
limits = c(0, 1.5e9),
breaks = seq(0, 1.5e9, by = 2.5e8),
labels = seq(0, 1500, by = 250)) +
scale_y_continuous("N Signal Fragments (millions)",
expand = c(0,0),
limits = c(0, 2e8),
breaks = seq(0, 2e8, by = 2e7),
labels = seq(0, 200, by = 20)) +
theme_bw() +
theme(panel.grid.minor = element_blank()) +
facet_wrap(facets = vars(well_label), ncol = 4) +
ggtitle("Saturation of Signal Fragments")
signal_saturation_plot
Plot Settings
n_grid_columns <- min(length(well_filtered_meta_list),2)
n_grid_rows <- ceiling(length(well_filtered_meta_list)/2)
grid_width <- n_grid_columns * 4
grid_height <- n_grid_rows * 2
frag_widths <- map_dfr(1:length(width_list),
function(x) {
fw <- width_list[[x]]
fw_pbmc_sample <- sub(paste0(batch_id,"_"),"",names(width_list)[x])
fw$pbmc_sample_id <- fw_pbmc_sample
fw$well_id <- well_samples$well_id[well_samples$pbmc_sample_id == fw_pbmc_sample]
fw$well_label <- well_samples$well_label[well_samples$pbmc_sample_id == fw_pbmc_sample]
fw
})
frag_widths <- frag_widths[width <= 750]
ggplot(data = frag_widths) +
geom_line(aes(x = width,
y = frac,
group = pass_fail,
color = pass_fail),
size = 1) +
xlim(0, 750) +
facet_wrap(vars(well_label), ncol = 2) +
theme_bw()
stm(paste0("Writing JSON to ",out_json))
qc_list_json <- jsonlite::toJSON(qc_list,
auto_unbox = TRUE,
pretty = TRUE)
writeLines(qc_list_json,
out_json)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plyr_1.8.6 viridis_0.5.1 viridisLite_0.3.0
## [4] Seurat_3.9.9.9010 future.apply_1.6.0 future_1.19.1
## [7] purrr_0.3.4 jsonlite_1.7.1 tidyr_1.1.2
## [10] plotly_4.9.3 gt_0.2.2 cowplot_1.1.0
## [13] dplyr_1.0.4 ggplot2_3.3.2 HTOparser_1.0.12
## [16] H5weaver_1.2.0 data.table_1.13.2 rhdf5_2.32.4
## [19] Matrix_1.2-18 batchreporter_1.1.0 optparse_1.6.6
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-1 deldir_0.1-29
## [4] ellipsis_0.3.1 ggridges_0.5.2 spatstat.data_2.1-0
## [7] farver_2.0.3 leiden_0.3.3 listenv_0.8.0
## [10] DT_0.16 getopt_1.20.3 ggrepel_0.8.2
## [13] RSpectra_0.16-0 R.methodsS3_1.8.1 codetools_0.2-16
## [16] splines_4.0.3 knitr_1.30 polyclip_1.10-0
## [19] ica_1.0-2 cluster_2.1.0 R.oo_1.24.0
## [22] png_0.1-7 uwot_0.1.9.9000 shiny_1.5.0
## [25] sctransform_0.3.1 compiler_4.0.3 httr_1.4.2
## [28] backports_1.1.10 assertthat_0.2.1 fastmap_1.0.1
## [31] lazyeval_0.2.2 later_1.1.0.1 htmltools_0.5.1.1
## [34] tools_4.0.3 rsvd_1.0.3 igraph_1.2.6
## [37] gtable_0.3.0 glue_1.4.2 RANN_2.6.1
## [40] reshape2_1.4.4 Rcpp_1.0.5 spatstat_1.64-1
## [43] vctrs_0.3.6 nlme_3.1-149 crosstalk_1.1.0.1
## [46] lmtest_0.9-38 xfun_0.18 stringr_1.4.0
## [49] globals_0.13.1 mime_0.9 miniUI_0.1.1.1
## [52] lifecycle_0.2.0 irlba_2.3.3 goftest_1.2-2
## [55] ids_1.0.1 MASS_7.3-53 zoo_1.8-8
## [58] scales_1.1.1 promises_1.1.1 spatstat.utils_2.1-0
## [61] parallel_4.0.3 RColorBrewer_1.1-2 yaml_2.2.1
## [64] reticulate_1.16 pbapply_1.4-3 gridExtra_2.3
## [67] sass_0.3.1 rpart_4.1-15 stringi_1.5.3
## [70] checkmate_2.0.0 commonmark_1.7 rlang_0.4.10
## [73] pkgconfig_2.0.3 matrixStats_0.57.0 evaluate_0.14
## [76] lattice_0.20-41 ROCR_1.0-11 tensor_1.5
## [79] Rhdf5lib_1.10.1 labeling_0.4.2 patchwork_1.0.1
## [82] htmlwidgets_1.5.3 tidyselect_1.1.0 RcppAnnoy_0.0.16
## [85] magrittr_1.5 R6_2.4.1 generics_0.0.2
## [88] DBI_1.1.1 mgcv_1.8-33 pillar_1.4.6
## [91] withr_2.3.0 fitdistrplus_1.1-1 survival_3.2-7
## [94] abind_1.4-5 tibble_3.0.4 crayon_1.3.4
## [97] uuid_0.1-4 KernSmooth_2.23-17 rmarkdown_2.4
## [100] grid_4.0.3 digest_0.6.26 xtable_1.8-4
## [103] httpuv_1.5.4 R.utils_2.10.1 munsell_0.5.0
Total time elapsed
end_time <- Sys.time()
diff_time <- end_time - start_time_atac
time_message <- paste0("Elapsed Time: ",
round(diff_time, 3),
" ", units(diff_time))
print(time_message)
## [1] "Elapsed Time: 16.516 secs"
stm(time_message)
stm("10x ATAC Batch QC Report complete.")
scATAC report module 1.0.0, Lucas Graybuck
start_time_atac_hash <- Sys.time()
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(data.table)
quiet_library(H5weaver)
quiet_library(ggplot2)
quiet_library(cowplot)
quiet_library(jsonlite)
quiet_library(purrr)
options(stringsAsFactors = FALSE)
Declaring start
stm("Starting scATAC Cell Hashing Report Module")
if(is.null(in_atac)) {
in_atac <- system.file("testdata/batch_qc", package = "ATAComb")
batch_id <- "X055"
out_dir <- tempdir()
}
stm(paste0("IN ATAC results dir : ", in_atac))
stm(paste0("IN HTO dir : ", in_hto))
stm(paste0("IN BatchID : ", batch_id))
stm(paste0("OUT Directory : ", out_dir))
print(c(
paste0("IN ATAC results dir : ", in_atac),
paste0("IN HTO dir : ", in_hto),
paste0("IN BatchID : ", batch_id),
paste0("OUT Directory : ", out_dir)
))
## [1] "IN ATAC results dir : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac"
## [2] "IN HTO dir : /Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/hto"
## [3] "IN BatchID : X070"
## [4] "OUT Directory : /var/folders/zc/cdsbwh5d60d_2h9c9brv90s80000gp/T//Rtmpo06rqu"
if(!dir.exists(in_atac)) {
stm(paste("ERROR: Cannot find IN results dir:", in_atac))
stop()
}
if(!dir.exists(in_hto)) {
stm(paste("ERROR: Cannot find IN hto dir:", in_hto))
stop()
}
if(!dir.exists(out_dir)) {
stm(paste("Creating output directory:", out_dir))
dir.create(out_dir)
}
out_prefix <- file.path(out_dir, paste0(batch_id, "_"))
Unfiltered metadata
meta_files <- list.files(in_atac,
pattern = "_all_metadata.csv.gz$",
full.names = TRUE)
if(length(meta_files) == 0) {
stop("Can't find unfiltered metadata files. Check input directory for *_all_metadata.csv.gz files.")
}
stm("IN Full Metadata Files:")
for(meta_file in meta_files) {
stm(meta_file)
print(meta_file)
}
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W1_all_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W2_all_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W3_all_metadata.csv.gz"
meta_list <- map(meta_files, fread)
sample_names <- sub(".+/","",sub("_all_metadata.csv.gz","",meta_files))
names(meta_list) <- sample_names
Filtered metadata
filt_meta_files <- list.files(in_atac,
pattern = "_filtered_metadata.csv.gz$",
full.names = TRUE)
if(length(filt_meta_files) < length(meta_files)) {
stop("Can't find filtered metadata files. Check input directory for *_filtered_metadata.csv.gz files.")
} else if(length(filt_meta_files) > length(meta_files)) {
stop("Can't find all metadata files. Check input directory for *_all_metadata.csv.gz files.")
}
stm("IN Filtered Metadata Files:")
for(filt_meta_file in filt_meta_files) {
stm(filt_meta_file)
print(filt_meta_file)
}
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W1_filtered_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W2_filtered_metadata.csv.gz"
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070/scatac/X070_X070-MP1C1W3_filtered_metadata.csv.gz"
filt_meta_list <- map(filt_meta_files, fread)
names(filt_meta_list) <- sub(".+/","",sub("_filtered_metadata.csv.gz","",filt_meta_files))
filt_meta_list <- filt_meta_list[sample_names]
HTO metadata
filtered_meta <- do.call(rbind, filt_meta_list)
meta <- do.call(rbind, meta_list)
cutoffs <- list(altius_frac = 0.5,
tss_frac = 0.2,
peaks_frac = 0.2)
meta$pass_fail <- "pass"
for(i in seq_along(cutoffs)) {
cut_name <- names(cutoffs)[i]
cut_val <- cutoffs[[i]]
cut_logic <- meta[[cut_name]] <= cut_val
meta$pass_fail[cut_logic] <- "fail"
}
meta$filtered <- meta$barcodes %in% filtered_meta$barcodes
meta$mito_frac <- meta$n_mito / meta$n_fragments
meta$well_label <- paste0(meta$well_id, "\n", meta$pbmc_sample_id)
well_samples <- unique(meta[,c("well_id","pbmc_sample_id","well_label")])
stm("Filtering based on QC cutoffs")
meta <- meta %>%
dplyr::left_join(dplyr::select(filtered_meta, barcodes, DoubletScore, DoubletEnrichment, TSSEnrichment), by = "barcodes")
filtered_meta <- meta
for(i in seq_along(cutoffs)) {
cut_name <- names(cutoffs)[i]
cut_val <- cutoffs[[i]]
filtered_meta <- filtered_meta[filtered_meta[[cut_name]] > cut_val]
filtered_meta <- filtered_meta[filtered_meta$filtered,]
}
filtered_meta$well_label <- paste0(filtered_meta$well_id, "\n", filtered_meta$pbmc_sample_id)
well_filtered_meta_list <- split(filtered_meta, filtered_meta$well_id)
Set up global metadata for reporting
meta$barcode_category <- "fail_qc"
meta$barcode_category[!meta$filtered & meta$pass_fail == "pass"] <- "pass_doublet"
meta$barcode_category[meta$filtered & meta$pass_fail == "pass"] <- "pass_singlet"
stm("Formatting HTO meta data")
hto_meta_list <- lapply(all_hto_meta, fread)
hto_meta_wells <- gsub("_.*","",basename(all_hto_meta))
hto_meta_list <- mapply(function(x,y){x$well_id <- y; x}, hto_meta_list, hto_meta_wells, SIMPLIFY = F)
hto_meta <- do.call(rbind, hto_meta_list)
rm("hto_meta_list")
hto_cat_levels <- c("no_hash", "singlet", "doublet", "multiplet")
hto_meta[in_hto_key, on = 'hto_barcode', hto_name := i.hto_name] # merge in the hto names
hto_meta[ , pool_id := gsub("C\\dW\\d","", well_id)]
hto_meta[ , sample_hto:= sprintf("%s\n%s", hto_name, get(sample_column_name))]
hto_meta[ , sample_hto_pool:= sprintf("%s\n%s%s", hto_name, get(sample_column_name), pool_id)]
hto_meta[ , hto_order:= as.numeric(gsub("HT","", hto_name))]
hto_meta[ , sample_hto_pool:= factor(sample_hto_pool, levels = unique(sample_hto_pool[order(pool_id, hto_order)]))]
hto_meta[ , hto_order:= NULL]
hto_meta[ , hto_category:= factor(hto_category, levels = hto_cat_levels)]
# well id to merge in hto info with atac data
hto_meta[ , atac_well_id := gsub("-P","-AP", well_id)]
hto_meta[ , atac_original_barcodes := paste0(cell_barcode, "-1")]
hto_meta_singlet <- subset(hto_meta, hto_category=="singlet")
stm("Merging hto data with ATAC metadata")
# Join in hto metadata for cells in atac dataset, rename some atac variables to avoid confusion
meta_hto <- meta %>%
dplyr::rename(pbmc_sample_id_old = pbmc_sample_id) %>%
dplyr::rename(atac_well_id = well_id) %>%
dplyr::mutate(atac_pool_id = gsub("C.*", "", atac_well_id)) %>%
dplyr::left_join(hto_meta, by = c("original_barcodes"="atac_original_barcodes", "atac_well_id")) %>%
dplyr::mutate(pool_id=ifelse(is.na(pool_id), gsub("-A","-", atac_pool_id),pool_id))
meta_singlet <- meta_hto %>%
filter(hto_category == "singlet")
# Full Join of hto and atac cells to compare existence between the two datasets
meta_hto_full <- meta %>%
dplyr::rename(pbmc_sample_id_old = pbmc_sample_id) %>%
dplyr::rename(atac_well_id = well_id) %>%
dplyr::mutate(atac_pool_id = gsub("C.*", "", atac_well_id)) %>%
dplyr::full_join(hto_meta, by = c("original_barcodes"="atac_original_barcodes", "atac_well_id")) %>%
dplyr::mutate(hto_category = factor(hto_category, levels = c(hto_cat_levels, "hto_missing"))) %>%
dplyr::mutate(hto_category = replace_na(hto_category, replace="hto_missing")) %>%
dplyr::mutate(barcode_category = replace_na(barcode_category, replace="atac_missing")) %>%
dplyr::mutate(pool_id=ifelse(is.na(pool_id), gsub("-A","-", atac_pool_id),pool_id))
stm("Merging hto data with filtered ATAC metadata")
# Join in hto metadata for cells in atac dataset, rename some atac variables to avoid confusion
filtered_meta_hto <- filtered_meta %>%
dplyr::rename(pbmc_sample_id_old = pbmc_sample_id) %>%
dplyr::rename(atac_well_id = well_id) %>%
dplyr::left_join(hto_meta, by = c("original_barcodes"="atac_original_barcodes", "atac_well_id"))
filtered_meta_singlet <- filtered_meta_hto %>%
dplyr::filter(hto_category == "singlet")
well_filtered_meta_singlet_list <- split(filtered_meta_singlet, filtered_meta_singlet$well_id)
sample_filtered_meta_singlet_list <- split(filtered_meta_singlet, filtered_meta_singlet$pbmc_sample_id)
qc_list <- list(report_type = "atac_batch_qc",
report_datetime = as.character(start_time_atac_hash),
report_uuid = ids::uuid(use_time = TRUE),
package = "batchreporter",
package_version = sessionInfo()$otherPkgs$batchreporter$Version,
batch_id = sub("_.+","",sample_names[1]))
out_json <- paste0(out_prefix, "atac_hashing_batch_qc_metrics.json")
Counts in “n_removed_atac” are cells in the HTO library that were filtered out in ATAC preprocessing for failing to meet minimal quality thresholds. Counts in “hto_missing” are cells in the ATAC library that were not detected in the HTO library.
stm("Generating HTO/ATAC QC Category contingency table")
barcode_counts_htocat <- meta_hto_full[,.(n_barcodes = nrow(.SD),
n_pass_qc = sum(.SD$pass_fail == "pass", na.rm = T),
n_fail_qc = sum(.SD$pass_fail == "fail", na.rm = T),
n_removed_atac = sum(is.na(.SD$pass_fail)),
percent_fail = round(sum(.SD$pass_fail == "fail", na.rm=T)/sum(!is.na(.SD$pass_fail))*100,2),
pass_singlets = sum(.SD$barcode_category == "pass_singlet", na.rm=T),
pass_doublets = sum(.SD$barcode_category == "pass_doublet", na.rm=T),
percent_doublets = round(sum(.SD$barcode_category == "pass_doublet", na.rm=T)/sum(.SD$pass_fail == "pass", na.rm=T)*100,2)),
.(atac_well_id, hto_category)]
qc_list$barcode_stats_allhtoatac <- as.list(barcode_counts_htocat)
qc_table(barcode_counts_htocat)
stm("Plotting HTO/ATAC QC Category barplots")
g1 <- qc_aligned_barplot_facet(meta_hto_full, category_x = "barcode_category", category_name = "HTO Category",
category_y = "hto_category", name_y = "N Cells",
name_x = "ATAC Barcode Category",colorset_y = "varibow", facet_formula = as.formula("~pool_id"))
g2 <- qc_aligned_barplot_facet(meta_hto_full, category_x = "hto_category", category_name = "ATAC Barcode Category",
category_y = "barcode_category", name_y = "N Cells",
name_x = "HTO Category",colorset_y = "varibow", facet_formula = as.formula("~pool_id"))
npool <- length(unique(meta_hto_full$pool_id))
temp_figwidth <- 6*npool
temp_figheight <- 4
make_subchunk(g1, subchunk_name = "subchunk_htocat_ataccat",chunk_opt_list = list(fig.width = temp_figwidth, fig.height = temp_figheight))
(function () { g})()
make_subchunk(g2, subchunk_name = "subchunk_ataccat_htocat",chunk_opt_list = list(fig.width = temp_figwidth, fig.height=temp_figheight))
(function () { g})()
# cowplot::plot_grid(g1, g2, align = "hv", axis = "tblr")
stm("Plotting ATAC Doublet Enrichment by HTO Categorys")
gbox1 <- ggplot(meta_hto, aes(hto_category, DoubletEnrichment)) +
geom_point(position = position_jitter(height = 0, width = 0.3), alpha = 0.1, aes(color = hto_category))+
geom_boxplot(alpha = 0, outlier.alpha = 1) +
facet_wrap(~pool_id)
# geom_violin(alpha = 0) +
gbox2 <- ggplot(meta_hto, aes(hto_category, DoubletScore)) +
geom_point(position = position_jitter(height = 0, width = 0.3), alpha = 0.1, aes(color = hto_category))+
geom_boxplot(alpha = 0, outlier.alpha = 1) +
facet_wrap(~pool_id)
# geom_violin(alpha = 0) +
cowplot::plot_grid(gbox1, gbox2, align = "hv")
QC stats for singlet cells as identified by hashing.
barcode_counts <- meta_singlet[,.(n_barcodes = nrow(.SD),
n_pass_qc = sum(.SD$pass_fail == "pass"),
n_fail_qc = sum(.SD$pass_fail == "fail"),
percent_fail = round(sum(.SD$pass_fail == "fail")/nrow(.SD)*100,2),
pass_singlets = sum(.SD$barcode_category == "pass_singlet"),
pass_doublets = sum(.SD$barcode_category == "pass_doublet"),
percent_doublets = round(sum(.SD$barcode_category == "pass_doublet")/sum(.SD$pass_fail == "pass")*100,2)),
.(pool_id, pbmc_sample_id)]
qc_list$barcode_stats <- as.list(barcode_counts)
qc_table(barcode_counts)
qc_stacked_barplot(meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
category_y = "barcode_category",
category_name = "Barcode Category",
as_fraction = TRUE) +
xlab("Fraction Cells")
qc_aligned_barplot_facet(meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
category_y = "barcode_category",
category_name = "Barcode Category", facet_formula = as.formula("~pool_id"))
qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "DoubletEnrichment",
name_y = "Doublet Enrichment",
log_y = FALSE,
fill = "dodgerblue")
qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "DoubletScore",
name_y = "Doublet Score",
log_y = FALSE,
fill = "dodgerblue")
qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "TSSEnrichment",
name_y = "TSS Enrichment",
log_y = FALSE,
fill = "dodgerblue")
Fragment QC stats for all singlet cells as determined by HTO category.
Plot Settings
n_grid_columns <- min(length(sample_filtered_meta_singlet_list),4)
n_grid_rows <- ceiling(length(sample_filtered_meta_singlet_list)/4)
grid_width <- n_grid_columns * 3
grid_height <- n_grid_rows * 3
fragment_stats <- filtered_meta_singlet[,.(n_singlets = nrow(.SD[.SD$filtered]),
med_raw_frag = round(median(n_fragments),0),
med_raw_perc_mito = round(median(mito_frac)*100,4),
med_unique_frag = round(median(n_unique),0),
med_unique_fritss = round(median(tss_frac),4),
med_unique_frip = round(median(peaks_frac),4),
med_unique_encode = round(median(altius_frac),4)
),
.(pool_id, pbmc_sample_id)]
qc_list$fragment_stats <- as.list(fragment_stats)
qc_table(fragment_stats)
# violins for all atac data by atac category
category_reads_violins <- qc_violin_plot(meta_hto,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "n_unique",
name_y = "Unique Fragments",
fill = "dodgerblue")
# violins for all atac data by hto category
hto_reads_violins <- qc_violin_plot(meta_hto,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_unique",
name_y = "Unique Fragments",
fill = "dodgerblue")
well_reads_violins <- qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "n_unique",
name_y = "Unique Fragments (Singlets)",
fill = "dodgerblue")
reads_violin_list <- list(category_reads_violins,
hto_reads_violins,
well_reads_violins)
plot_grid(plotlist = reads_violin_list,
ncol = 3, rel_widths = c(1, 1, 3),
nrow = 1, align = "h")
category_mito_violins <- qc_violin_plot(meta_hto,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "mito_frac",
name_y = "Fraction Mitochondrial",
fill = "darkgreen",
log_y = F)
hto_mito_violins <- qc_violin_plot(meta_hto,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "mito_frac",
name_y = "Fraction Mitochondrial",
fill = "darkgreen",
log_y = F)
sample_mito_violins <- qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "mito_frac",
name_y = "Fraction Mito. (Singlets)",
fill = "darkgreen",
log_y = F)
mito_violin_list <- list(category_mito_violins,
hto_mito_violins,
sample_mito_violins)
plot_grid(plotlist = mito_violin_list,
ncol = 3, rel_widths = c(1, 1, 3),
nrow = 1, align = "h")
category_frip_violins <- qc_violin_plot(meta_hto,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "peaks_frac",
name_y = "FRIP",
fill = "orangered",
log_y = FALSE)
hto_frip_violins <- qc_violin_plot(meta_hto,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "peaks_frac",
name_y = "FRIP",
fill = "orangered",
log_y = FALSE)
sample_frip_violins <- qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "peaks_frac",
name_y = "FRIP (Singlets)",
fill = "orangered",
log_y = FALSE)
frip_violin_list <- list(category_frip_violins,
hto_frip_violins,
sample_frip_violins)
plot_grid(plotlist = frip_violin_list,
ncol = 3, rel_widths = c(1, 1, 3),
nrow = 1, align = "h")
qc_scatter_list <- map(sample_filtered_meta_singlet_list,
function(sample_meta) {
qc_scatter_plot(sample_meta,
column_x = "n_unique",
name_x = "N Unique Fragments per Cell",
column_y = "peaks_frac",
name_y = "Frac Fragments in Peaks (peaks_frac)",
log_x = TRUE, log_y = FALSE, frac_y = TRUE,
show_targets = FALSE,
color = "orangered") +
geom_vline(aes(xintercept = 2.5e3), linetype = "dashed", size = 0.2) +
geom_hline(aes(yintercept = cutoffs$peaks_frac), linetype = "dashed", size = 0.2) +
ggtitle(sprintf("%s %s", sample_meta$pool_id[1], sample_meta$pbmc_sample_id[1]))
})
plot_grid(plotlist = qc_scatter_list,
ncol = n_grid_columns,
nrow = n_grid_rows)
category_fritss_violins <- qc_violin_plot(meta_hto,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "tss_frac",
name_y = "FRITSS",
fill = "mediumorchid3",
log_y = FALSE)
hto_fritss_violins <- qc_violin_plot(meta_hto,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "tss_frac",
name_y = "FRITSS",
fill = "mediumorchid3",
log_y = FALSE)
well_fritss_violins <- qc_violin_plot(filtered_meta_singlet,
category_x = "sample_hto_pool",
name_x = "Sample",
column_y = "tss_frac",
name_y = "FRITSS (Singlets)",
fill = "mediumorchid3",
log_y = FALSE)
fritss_violin_list <- list(category_fritss_violins,
hto_fritss_violins,
well_fritss_violins)
plot_grid(plotlist = fritss_violin_list,
ncol = 3, rel_widths = c(1, 1, 3),
nrow = 1, align = "h")
qc_scatter_list <- map(sample_filtered_meta_singlet_list,
function(sample_meta) {
qc_scatter_plot(sample_meta,
column_x = "n_unique",
name_x = "N Unique Fragments per Cell",
column_y = "tss_frac",
name_y = "Frac Fragments in TSS (tss_frac)",
log_x = TRUE, log_y = FALSE, frac_y = TRUE,
show_targets = FALSE,
color = "mediumorchid3") +
geom_vline(aes(xintercept = 2.5e3), linetype = "dashed", size = 0.2) +
geom_hline(aes(yintercept = cutoffs$tss_frac), linetype = "dashed", size = 0.2) +
ggtitle(sprintf("%s %s", sample_meta$pool_id[1],sample_meta$pbmc_sample_id[1]))
})
plot_grid(plotlist = qc_scatter_list,
ncol = n_grid_columns,
nrow = n_grid_rows)
category_enc_violins <- qc_violin_plot(meta_hto,
category_x = "barcode_category",
name_x = "Barcode Type",
column_y = "altius_frac",
name_y = "FRIENCODE",
fill = "darkred",
log_y = FALSE)
hto_enc_violins <- qc_violin_plot(meta_hto,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "altius_frac",
name_y = "FRIENCODE",
fill = "darkred",
log_y = FALSE)
well_enc_violins <- qc_violin_plot(filtered_meta_singlet,
category_x = "pbmc_sample_id",
name_x = "Sample ID",
column_y = "altius_frac",
name_y = "FRIENCODE (Singlets)",
fill = "darkred",
log_y = FALSE)
enc_violin_list <- list(category_enc_violins,
hto_enc_violins,
well_enc_violins)
plot_grid(plotlist = enc_violin_list,
ncol = 3, rel_widths = c(1, 1, 3),
nrow = 1, align = "h")
qc_scatter_list <-map(sample_filtered_meta_singlet_list,
function(sample_meta) {
qc_scatter_plot(sample_meta,
column_x = "n_unique",
name_x = "N Unique Fragments per Cell",
column_y = "altius_frac",
name_y = "Frac Fragments in Altius (altius_frac)",
log_x = TRUE, log_y = FALSE, frac_y = TRUE,
show_targets = FALSE,
color = "darkred") +
geom_vline(aes(xintercept = 2.5e3), linetype = "dashed", size = 0.2) +
geom_hline(aes(yintercept = cutoffs$altius_frac), linetype = "dashed", size = 0.2) +
ggtitle(sprintf("%s %s", sample_meta$pool_id[1],sample_meta$pbmc_sample_id[1]))
})
plot_grid(plotlist = qc_scatter_list,
ncol = n_grid_columns,
nrow = n_grid_rows)
stm(paste0("Writing JSON to ",out_json))
qc_list_json <- jsonlite::toJSON(qc_list,
auto_unbox = TRUE,
pretty = TRUE)
writeLines(qc_list_json,
out_json)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plyr_1.8.6 viridis_0.5.1 viridisLite_0.3.0
## [4] Seurat_3.9.9.9010 future.apply_1.6.0 future_1.19.1
## [7] purrr_0.3.4 jsonlite_1.7.1 tidyr_1.1.2
## [10] plotly_4.9.3 gt_0.2.2 cowplot_1.1.0
## [13] dplyr_1.0.4 ggplot2_3.3.2 HTOparser_1.0.12
## [16] H5weaver_1.2.0 data.table_1.13.2 rhdf5_2.32.4
## [19] Matrix_1.2-18 batchreporter_1.1.0 optparse_1.6.6
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-1 deldir_0.1-29
## [4] ellipsis_0.3.1 ggridges_0.5.2 spatstat.data_2.1-0
## [7] farver_2.0.3 leiden_0.3.3 listenv_0.8.0
## [10] DT_0.16 getopt_1.20.3 ggrepel_0.8.2
## [13] RSpectra_0.16-0 R.methodsS3_1.8.1 codetools_0.2-16
## [16] splines_4.0.3 knitr_1.30 polyclip_1.10-0
## [19] ica_1.0-2 cluster_2.1.0 R.oo_1.24.0
## [22] png_0.1-7 uwot_0.1.9.9000 shiny_1.5.0
## [25] sctransform_0.3.1 compiler_4.0.3 httr_1.4.2
## [28] backports_1.1.10 assertthat_0.2.1 fastmap_1.0.1
## [31] lazyeval_0.2.2 later_1.1.0.1 htmltools_0.5.1.1
## [34] tools_4.0.3 rsvd_1.0.3 igraph_1.2.6
## [37] gtable_0.3.0 glue_1.4.2 RANN_2.6.1
## [40] reshape2_1.4.4 Rcpp_1.0.5 spatstat_1.64-1
## [43] vctrs_0.3.6 nlme_3.1-149 crosstalk_1.1.0.1
## [46] lmtest_0.9-38 xfun_0.18 stringr_1.4.0
## [49] globals_0.13.1 mime_0.9 miniUI_0.1.1.1
## [52] lifecycle_0.2.0 irlba_2.3.3 goftest_1.2-2
## [55] ids_1.0.1 MASS_7.3-53 zoo_1.8-8
## [58] scales_1.1.1 promises_1.1.1 spatstat.utils_2.1-0
## [61] parallel_4.0.3 RColorBrewer_1.1-2 yaml_2.2.1
## [64] reticulate_1.16 pbapply_1.4-3 gridExtra_2.3
## [67] sass_0.3.1 rpart_4.1-15 stringi_1.5.3
## [70] checkmate_2.0.0 commonmark_1.7 rlang_0.4.10
## [73] pkgconfig_2.0.3 matrixStats_0.57.0 evaluate_0.14
## [76] lattice_0.20-41 ROCR_1.0-11 tensor_1.5
## [79] Rhdf5lib_1.10.1 labeling_0.4.2 patchwork_1.0.1
## [82] htmlwidgets_1.5.3 tidyselect_1.1.0 RcppAnnoy_0.0.16
## [85] magrittr_1.5 R6_2.4.1 generics_0.0.2
## [88] DBI_1.1.1 mgcv_1.8-33 pillar_1.4.6
## [91] withr_2.3.0 fitdistrplus_1.1-1 survival_3.2-7
## [94] abind_1.4-5 tibble_3.0.4 crayon_1.3.4
## [97] uuid_0.1-4 KernSmooth_2.23-17 rmarkdown_2.4
## [100] grid_4.0.3 digest_0.6.26 xtable_1.8-4
## [103] httpuv_1.5.4 R.utils_2.10.1 munsell_0.5.0
Total time elapsed
end_time <- Sys.time()
diff_time <- end_time - start_time_atac_hash
time_message <- paste0("Elapsed Time: ",
round(diff_time, 3),
" ", units(diff_time))
print(time_message)
## [1] "Elapsed Time: 25.469 secs"
stm(time_message)
stm("10x ATAC Batch QC Report complete.")
Hashed scATAC batch QC module 1.0.0, Lucas Graybuck, Lauren Okada
Input Directory:
in_dir
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/X070"
Input Directory Contents:
folders <- list.dirs(in_dir, recursive = FALSE)
file_list <- lapply(folders, function(x){
dir(x, recursive = TRUE)
})
names(file_list) <- basename(folders)
file_list
## $adt
## [1] "X070-EP1C1W1_adt_positive_tag_counts.csv"
## [2] "X070-EP1C1W1_Tag_Counts.csv"
## [3] "X070-EP1C1W2_adt_positive_tag_counts.csv"
## [4] "X070-EP1C1W2_Tag_Counts.csv"
## [5] "X070-EP1C1W3_adt_positive_tag_counts.csv"
## [6] "X070-EP1C1W3_Tag_Counts.csv"
##
## $hto
## [1] "X070-P1C1W1_hto_category_table.csv.gz"
## [2] "X070-P1C1W1_hto_count_matrix.csv.gz"
## [3] "X070-P1C1W1_hto_processing_metrics.json"
## [4] "X070-P1C1W2_hto_category_table.csv.gz"
## [5] "X070-P1C1W2_hto_count_matrix.csv.gz"
## [6] "X070-P1C1W2_hto_processing_metrics.json"
## [7] "X070-P1C1W3_hto_category_table.csv.gz"
## [8] "X070-P1C1W3_hto_count_matrix.csv.gz"
## [9] "X070-P1C1W3_hto_processing_metrics.json"
##
## $scatac
## [1] "X070_X070-MP1C1W1_all_metadata.csv.gz"
## [2] "X070_X070-MP1C1W1_filtered_metadata.csv.gz"
## [3] "X070_X070-MP1C1W1_fragment_width_summary.csv.gz"
## [4] "X070_X070-MP1C1W1_saturation_projection.csv.gz"
## [5] "X070_X070-MP1C1W2_all_metadata.csv.gz"
## [6] "X070_X070-MP1C1W2_filtered_metadata.csv.gz"
## [7] "X070_X070-MP1C1W2_fragment_width_summary.csv.gz"
## [8] "X070_X070-MP1C1W2_saturation_projection.csv.gz"
## [9] "X070_X070-MP1C1W3_all_metadata.csv.gz"
## [10] "X070_X070-MP1C1W3_filtered_metadata.csv.gz"
## [11] "X070_X070-MP1C1W3_fragment_width_summary.csv.gz"
## [12] "X070_X070-MP1C1W3_saturation_projection.csv.gz"
##
## $scrna
## [1] "X070-P1C1W1.h5" "X070-P1C1W2.h5" "X070-P1C1W3.h5"
Batch processing metadata:
in_batch_meta
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/batch-metadata.json"
Config File:
in_config
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/default_rna_config_v1.csv"
Key File:
in_key
## [1] "/Library/Frameworks/R.framework/Versions/4.0/Resources/library/batchreporter/extdata/example_sample_key_X070.csv"
Output Directory:
out_dir
## [1] "/var/folders/zc/cdsbwh5d60d_2h9c9brv90s80000gp/T//Rtmpo06rqu"
Session Info:
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plyr_1.8.6 viridis_0.5.1 viridisLite_0.3.0
## [4] Seurat_3.9.9.9010 future.apply_1.6.0 future_1.19.1
## [7] purrr_0.3.4 jsonlite_1.7.1 tidyr_1.1.2
## [10] plotly_4.9.3 gt_0.2.2 cowplot_1.1.0
## [13] dplyr_1.0.4 ggplot2_3.3.2 HTOparser_1.0.12
## [16] H5weaver_1.2.0 data.table_1.13.2 rhdf5_2.32.4
## [19] Matrix_1.2-18 batchreporter_1.1.0 optparse_1.6.6
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-1 deldir_0.1-29
## [4] ellipsis_0.3.1 ggridges_0.5.2 spatstat.data_2.1-0
## [7] farver_2.0.3 leiden_0.3.3 listenv_0.8.0
## [10] DT_0.16 getopt_1.20.3 ggrepel_0.8.2
## [13] RSpectra_0.16-0 R.methodsS3_1.8.1 codetools_0.2-16
## [16] splines_4.0.3 knitr_1.30 polyclip_1.10-0
## [19] ica_1.0-2 cluster_2.1.0 R.oo_1.24.0
## [22] png_0.1-7 uwot_0.1.9.9000 shiny_1.5.0
## [25] sctransform_0.3.1 compiler_4.0.3 httr_1.4.2
## [28] backports_1.1.10 assertthat_0.2.1 fastmap_1.0.1
## [31] lazyeval_0.2.2 later_1.1.0.1 htmltools_0.5.1.1
## [34] tools_4.0.3 rsvd_1.0.3 igraph_1.2.6
## [37] gtable_0.3.0 glue_1.4.2 RANN_2.6.1
## [40] reshape2_1.4.4 Rcpp_1.0.5 spatstat_1.64-1
## [43] vctrs_0.3.6 nlme_3.1-149 crosstalk_1.1.0.1
## [46] lmtest_0.9-38 xfun_0.18 stringr_1.4.0
## [49] globals_0.13.1 mime_0.9 miniUI_0.1.1.1
## [52] lifecycle_0.2.0 irlba_2.3.3 goftest_1.2-2
## [55] ids_1.0.1 MASS_7.3-53 zoo_1.8-8
## [58] scales_1.1.1 promises_1.1.1 spatstat.utils_2.1-0
## [61] parallel_4.0.3 RColorBrewer_1.1-2 yaml_2.2.1
## [64] reticulate_1.16 pbapply_1.4-3 gridExtra_2.3
## [67] sass_0.3.1 rpart_4.1-15 stringi_1.5.3
## [70] checkmate_2.0.0 commonmark_1.7 rlang_0.4.10
## [73] pkgconfig_2.0.3 matrixStats_0.57.0 evaluate_0.14
## [76] lattice_0.20-41 ROCR_1.0-11 tensor_1.5
## [79] Rhdf5lib_1.10.1 labeling_0.4.2 patchwork_1.0.1
## [82] htmlwidgets_1.5.3 tidyselect_1.1.0 RcppAnnoy_0.0.16
## [85] magrittr_1.5 R6_2.4.1 generics_0.0.2
## [88] DBI_1.1.1 mgcv_1.8-33 pillar_1.4.6
## [91] withr_2.3.0 fitdistrplus_1.1-1 survival_3.2-7
## [94] abind_1.4-5 tibble_3.0.4 crayon_1.3.4
## [97] uuid_0.1-4 KernSmooth_2.23-17 rmarkdown_2.4
## [100] grid_4.0.3 digest_0.6.26 xtable_1.8-4
## [103] httpuv_1.5.4 R.utils_2.10.1 munsell_0.5.0
Total time elapsed
end_time_all <- Sys.time()
diff_time_all <- end_time_all - start_time_all
time_message_all <- paste0("Elapsed Time: ",
round(diff_time_all, 3),
" ", units(diff_time_all))
print(time_message_all)
## [1] "Elapsed Time: 8.08 mins"
stm(time_message_all)
stm("Batch report process complete.")
NGS report v.1.0.0, Lauren Okada